A semi-automatic parser of Ancient Greek and Latin languages.
https://lexis.glossa.info/
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
366 lines
8.6 KiB
366 lines
8.6 KiB
package main |
|
|
|
// Copyright (C) 2021, Maxim Lihachev, <envrm@yandex.ru> |
|
// |
|
// This program is free software: you can redistribute it and/or modify it |
|
// under the terms of the GNU General Public License as published by the Free |
|
// Software Foundation, version 3. |
|
// |
|
// This program is distributed in the hope that it will be useful, |
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
// GNU General Public License for more details. |
|
// |
|
// You should have received a copy of the GNU General Public License |
|
// along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
|
|
import (
	"flag"
	"fmt"
	"html/template"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/gorilla/mux"
)
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// Default options. |
|
const ( |
|
defaultLanguage = "ru" |
|
|
|
defaultHost = "127.0.0.1" |
|
defaultPort = 9000 |
|
|
|
webDirectory = "web" |
|
tpls = webDirectory + "/tpl/*.html" |
|
assets = webDirectory + "/assets/" |
|
defaultTemplate = "page" |
|
maxTextLength = 100 |
|
|
|
perseus = "http://www.perseus.tufts.edu/hopper/morph" |
|
|
|
regexpMarks = `[\d*?()$.,!"'·;:]+` |
|
regexpDelimiters = `\n+` |
|
) |
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// PageData carries everything a page template needs: the page name,
// the UI language code and the analysed words to display.
type PageData struct {
	Page string // page/template name: "index", "about" or "parse"
	Lang string // UI language code (e.g. "ru")
	Data []Word // analysed words, in original text order
}
|
|
|
// Word is one token of the input text together with its Perseus lookup
// URL and any grammatical analyses found for it.
type Word struct {
	Word      string     // the token as it appeared in the text
	Url       string     // Perseus morph URL used for the lookup
	Mark      bool       // true if the token is a punctuation mark
	Delimiter bool       // true if the token is a line delimiter
	Analysis  []Analysis // possible analyses scraped from Perseus (empty for marks/delimiters)
}
|
|
|
// Analysis is one possible dictionary interpretation of a word:
// a lemma variant, its translation, and the matching inflected forms.
type Analysis struct {
	Variant     string // lemma/variant heading (taken from the Perseus <h4> element)
	Translation string // lemma definition text (".lemma_definition" element)
	Forms       []Form // inflected forms with grammatical descriptions
}
|
|
|
// Form pairs an inflected form with its grammatical description.
type Form struct {
	Form    string // the inflected form (first table cell)
	Grammar string // grammatical description (second table cell)
}
|
|
|
// Queue tracks how many parseWord goroutines are still pending so that
// the last one to finish can close the shared result channel.
type Queue struct {
	rest  int        // number of words still being processed
	mutex sync.Mutex // guards rest and the channel close
}
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// printPageData prints grammatic analysis with indentations. |
|
// Used as output formatter for CLI interface for Lexis. |
|
func printPageData(data PageData) { |
|
for _, w := range data.Data { |
|
fmt.Printf("\n\033[1;32m>>> %s\033[0m\n", w.Word) |
|
|
|
for i, grammar := range w.Analysis { |
|
fmt.Printf("\n %d) \033[35m%s\033[0m / %s\n", i+1, grammar.Variant, grammar.Translation) |
|
|
|
for _, form := range grammar.Forms { |
|
fmt.Printf(" > \033[36m%s\033[0m / %s\n", form.Form, form.Grammar) |
|
} |
|
} |
|
} |
|
} |
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// latinLetters matches any Latin letter. Compiled once at package scope
// so recognizeLanguage does not pay the regexp compilation cost on every
// call; the previous `^.*[a-zA-Z]+.*$` anchors were redundant for a
// containment check.
// (Earlier revision detected Greek instead: `[α-ωΑ-Ω]`.)
var latinLetters = regexp.MustCompile(`[a-zA-Z]`)

// recognizeLanguage guesses the language of a word for the Perseus API.
// It currently distinguishes only Latin ("lat") from Greek ("greek"):
// any word containing at least one Latin letter is treated as Latin,
// everything else as Greek.
func recognizeLanguage(word string) string {
	if latinLetters.MatchString(word) {
		return "lat"
	}
	return "greek"
}
|
|
|
// perseusUrl makes full URI for Perseus morph's page. |
|
func perseusUrl (word string) string { |
|
return perseus + "?l=" + word + "&la="+ recognizeLanguage(word) |
|
} |
|
|
|
// excludeElementText seeks for element on the page and returns its inner text. |
|
func excludeElementText (selection *goquery.Selection, path string) string { |
|
return strings.TrimSpace(selection.Find(path).Text()) |
|
} |
|
|
|
// splitElements splits found HTML elements into list. |
|
func splitElements (selection *goquery.Selection) []string { |
|
list := make([]string, selection.Size()) |
|
|
|
selection.Each(func(i int, element *goquery.Selection) { |
|
list[i] = strings.TrimSpace(element.Text()) |
|
}) |
|
|
|
return list |
|
} |
|
|
|
// parseWords gets possible definitions, translations and grammar analysis from Perseus. |
|
func parseWord (word string, collector chan Word, queue *Queue) { |
|
w := Word{ Word: word, Url: perseusUrl(word), Analysis: []Analysis{}} |
|
|
|
r := regexp.MustCompile(regexpMarks) |
|
n := regexp.MustCompile(regexpDelimiters) |
|
|
|
if r.MatchString(word) { |
|
w.Mark = true |
|
} else if n.MatchString(word) { |
|
w.Delimiter = true |
|
} |
|
|
|
if !w.Delimiter && !w.Mark { |
|
document, _ := goquery.NewDocument(perseusUrl(word)) |
|
|
|
document.Find("div.analysis").Each(func(i int, analysis *goquery.Selection) { |
|
variant := excludeElementText(analysis, "h4") |
|
translation := excludeElementText(analysis, ".lemma_definition") |
|
|
|
f := []Form{} |
|
|
|
analysis.Find("table tr").Each(func(i int, tr *goquery.Selection) { |
|
tds := splitElements(tr.Find("td")) |
|
form, grammar := tds[0], tds[1] |
|
|
|
f = append(f, Form{form, grammar}) |
|
}) |
|
|
|
w.Analysis = append(w.Analysis, Analysis{variant, translation, f}) |
|
}) |
|
} |
|
|
|
queue.mutex.Lock() |
|
queue.rest -= 1 |
|
|
|
collector <- w |
|
|
|
if queue.rest <= 0 { |
|
close(collector) |
|
} |
|
queue.mutex.Unlock() |
|
} |
|
|
|
// parseTokens gets analysis for each word in a list concurrently. |
|
func parseTokens (text string) map[string]Word { |
|
words := make(map[string]Word) |
|
|
|
tokens := tokenize(text) |
|
|
|
queue := Queue{rest: len(tokens)} |
|
|
|
collector := make(chan Word, queue.rest) |
|
|
|
for _, w := range tokens { |
|
go parseWord(w, collector, &queue) |
|
} |
|
|
|
for w := range collector { |
|
words[w.Word] = w |
|
} |
|
|
|
return words |
|
} |
|
|
|
// parseText starts parsing of each word in the text and returns PageData structure. |
|
func parseText (text string) PageData { |
|
textForParsing := splitText(text) |
|
|
|
data := PageData{ Page: "parse" } |
|
|
|
if len(textForParsing) > maxTextLength { |
|
textForParsing = textForParsing[:maxTextLength] |
|
} |
|
|
|
forms := parseTokens(strings.Join(textForParsing, " ")) |
|
|
|
for _, w := range textForParsing { |
|
form := forms[strings.ToLower(w)] |
|
|
|
form.Word = w |
|
|
|
data.Data = append(data.Data, form) |
|
} |
|
|
|
return data |
|
} |
|
|
|
// splitText returns list of words and punctuation marks. |
|
func splitText (logion string) []string { |
|
re := regexp.MustCompile(`[\p{L}]+|` + regexpMarks + `|` + regexpDelimiters) |
|
return re.FindAllString(logion, -1) |
|
} |
|
|
|
// tokenize returns list of unique words and punctuations in lower case. |
|
func tokenize (logion string) []string { |
|
tokens := make(map[string]bool) |
|
|
|
for _, e := range splitText(logion) { |
|
q := strings.ToLower(e) |
|
|
|
if _, exists := tokens[q]; !exists && q != "" { |
|
tokens[q] = true |
|
} |
|
} |
|
|
|
var result []string |
|
for item, _ := range tokens { |
|
result = append(result, item) |
|
} |
|
return result |
|
} |
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// renderPage fills in Go HTML template. |
|
func renderPage(w http.ResponseWriter, data PageData) { |
|
t, _ := template.ParseGlob(tpls) |
|
t.ExecuteTemplate(w, defaultTemplate, data) |
|
} |
|
|
|
// srvDefault redirects to index page in default language |
|
func srvDefault(w http.ResponseWriter, r *http.Request) { |
|
renderPage(w, PageData{ Page: "index", Lang: defaultLanguage }) |
|
} |
|
|
|
// notFound sends any unmatched request back to the index page.
func notFound(w http.ResponseWriter, r *http.Request) {
	const indexPath = "/"
	http.Redirect(w, r, indexPath, http.StatusSeeOther)
}
|
|
|
// srvParse shows parse.html page. |
|
func srvParse(w http.ResponseWriter, r *http.Request) { |
|
r.ParseForm() |
|
fmt.Println("PARSE:") |
|
fmt.Println(r.Form) |
|
fmt.Println(r.Form["url_long"]) |
|
|
|
vars := mux.Vars(r) |
|
|
|
if text, ok := r.Form["text"]; ok { |
|
t := strings.TrimSpace(strings.Join(text, "")) |
|
|
|
if strings.TrimSpace(t) != "" { |
|
data := parseText(t) |
|
|
|
data.Lang = vars["lang"] |
|
|
|
renderPage(w, data) |
|
} else { |
|
http.Redirect(w, r, "/" + vars["lang"] + "/", http.StatusSeeOther) |
|
} |
|
} |
|
} |
|
|
|
// srvPage |
|
func srvPage(w http.ResponseWriter, r *http.Request) { |
|
vars := mux.Vars(r) |
|
|
|
switch vars["page"] { |
|
case "parse": |
|
srvParse(w, r) |
|
case "about": |
|
renderPage(w, PageData{ Page: "about", Lang: vars["lang"] }) |
|
case "": |
|
renderPage(w, PageData{ Page: "index", Lang: vars["lang"] }) |
|
default: |
|
srvDefault(w, r) |
|
} |
|
} |
|
|
|
// srv runs server on specified port. |
|
func srv(host string, port int) { |
|
fmt.Println("Server is started at " + host + ":" + strconv.Itoa(port) + ".") |
|
|
|
router := mux.NewRouter() |
|
|
|
http.Handle("/assets/", http.StripPrefix("/assets/", http.FileServer(http.Dir(assets)))) |
|
|
|
router.HandleFunc("/", srvDefault) |
|
router.HandleFunc("/{lang}/", srvPage) |
|
router.HandleFunc("/{lang}/{page}/", srvPage) |
|
|
|
router.NotFoundHandler = http.HandlerFunc(notFound) |
|
|
|
http.Handle("/", router) |
|
|
|
err := http.ListenAndServe(host + ":" + strconv.Itoa(port), nil) |
|
|
|
if err != nil { |
|
fmt.Println("ListenAndServe: ", err) |
|
} |
|
} |
|
|
|
// -------------------------------------------------------------------------------- |
|
|
|
// showHelp prints CLI and web usage information followed by the
// registered flag defaults. Installed as flag.Usage in main.
func showHelp() {
	program := filepath.Base(os.Args[0])

	usage := "Usage:\n" +
		fmt.Sprintf("  CLI: %s <text>\n", program) +
		fmt.Sprintf("  WEB: %s --serve [--host] [--port]\n\n", program)
	fmt.Print(usage)

	flag.PrintDefaults()
}
|
|
|
func main() { |
|
flag.Usage = showHelp |
|
|
|
host := flag.String("host", defaultHost, "Host to bind") |
|
port := flag.Int("port", defaultPort, "Server port") |
|
serve := flag.Bool("serve", false, "Serve LEΞΙΣ") |
|
|
|
flag.Parse() |
|
|
|
if (*serve) { |
|
srv(*host, *port) |
|
} else if (len(flag.Args()) > 0) { |
|
txt := strings.Join(flag.Args(), " ") |
|
|
|
fmt.Printf("<< %s >>\n", txt) |
|
printPageData(parseText(txt)) |
|
fmt.Println("") |
|
} else { |
|
flag.Usage() |
|
} |
|
} |
|
|
|
|