A semi-automatic parser of Ancient Greek and Latin languages. https://lexis.glossa.info/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

366 lines
8.6 KiB

package main
// Copyright (C) 2021, Maxim Lihachev, <envrm@yandex.ru>
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
import (
"os"
"fmt"
"flag"
"sync"
"regexp"
"strings"
"strconv"
"net/http"
"path/filepath"
"html/template"
"github.com/gorilla/mux"
"github.com/PuerkitoBio/goquery"
)
// --------------------------------------------------------------------------------
// Default options and fixed settings shared by the CLI and the web server.
const (
// defaultLanguage is the interface language used when the index page is
// served without a language segment in the URL.
defaultLanguage = "ru"
// defaultHost is the default value of the --host flag.
defaultHost = "127.0.0.1"
// defaultPort is the default value of the --port flag.
defaultPort = 9000
// webDirectory is the directory holding templates and static assets.
webDirectory = "web"
// tpls is the glob pattern of HTML templates handed to template.ParseGlob.
tpls = webDirectory + "/tpl/*.html"
// assets is the directory served under the /assets/ URL prefix.
assets = webDirectory + "/assets/"
// defaultTemplate is the name of the template executed for every page.
defaultTemplate = "page"
// maxTextLength is the maximum number of tokens (words, marks and line
// breaks) that parseText keeps from the submitted text.
maxTextLength = 100
// perseus is the base address of the Perseus morphological analysis page.
perseus = "http://www.perseus.tufts.edu/hopper/morph"
// regexpMarks matches runs of digits and punctuation marks.
regexpMarks = `[\d*?()$.,!"'·;:]+`
// regexpDelimiters matches runs of line breaks.
regexpDelimiters = `\n+`
)
// --------------------------------------------------------------------------------
// PageData is the value handed to the page template by renderPage.
type PageData struct {
// Page names the page to render; this file sets it to "index", "about"
// or "parse".
Page string
// Lang is the interface language: either the {lang} URL variable or
// defaultLanguage.
Lang string
// Data holds the analysed tokens in text order; only parseText fills it.
Data []Word
}
// Word is a single token of the input text together with its analysis.
type Word struct {
// Word is the token text.
Word string
// Url is the Perseus address built for the token by perseusUrl.
Url string
// Mark is set by parseWord when the token matches regexpMarks.
Mark bool
// Delimiter is set by parseWord when the token is not a mark and matches
// regexpDelimiters.
Delimiter bool
// Analysis lists the readings scraped from Perseus; parseWord leaves it
// empty for marks and delimiters.
Analysis []Analysis
}
// Analysis is one reading of a word, built by parseWord from a single
// "div.analysis" element of the Perseus page.
type Analysis struct {
// Variant is the trimmed text of the element's "h4" heading.
Variant string
// Translation is the trimmed text of the ".lemma_definition" element.
Translation string
// Forms holds one entry per table row found inside the element.
Forms []Form
}
// Form is one table row of an analysis as read by parseWord.
type Form struct {
// Form is the text of the row's first cell.
Form string
// Grammar is the text of the row's second cell.
Grammar string
}
// Queue tracks how many parseWord goroutines still have to deliver a result.
// It contains a mutex, so it is shared by pointer (see parseTokens).
type Queue struct {
// rest is the number of words not yet delivered; parseWord decrements it
// and closes the result channel once it reaches zero.
rest int
// mutex is held by parseWord while it updates rest, sends its result and
// possibly closes the channel.
mutex sync.Mutex
}
// --------------------------------------------------------------------------------
// printPageData writes the grammatical analysis of every word to standard
// output, coloured with ANSI escape sequences.
// It is the output formatter of the Lexis command-line interface.
func printPageData(data PageData) {
	const (
		wordFormat    = "\n\033[1;32m>>> %s\033[0m\n"
		variantFormat = "\n %d) \033[35m%s\033[0m / %s\n"
		formFormat    = " > \033[36m%s\033[0m / %s\n"
	)

	for wi := range data.Data {
		entry := &data.Data[wi]
		fmt.Printf(wordFormat, entry.Word)

		for ai := range entry.Analysis {
			reading := &entry.Analysis[ai]
			// Readings are numbered from one for the reader.
			fmt.Printf(variantFormat, ai+1, reading.Variant, reading.Translation)

			for _, inflected := range reading.Forms {
				fmt.Printf(formFormat, inflected.Form, inflected.Grammar)
			}
		}
	}
}
// --------------------------------------------------------------------------------
// latinLetter matches a single basic Latin letter. It is compiled once at
// package initialisation rather than on every recognizeLanguage call.
var latinLetter = regexp.MustCompile(`[a-zA-Z]`)

// recognizeLanguage guesses the language of word and returns the language
// code expected by Perseus: "lat" when word contains at least one basic
// Latin letter (a-z, A-Z), "greek" otherwise. Anything that is not
// recognisably Latin, including the empty string, is reported as Greek.
func recognizeLanguage(word string) string {
	// An unanchored single-letter match is enough; unlike a `^.*…​.*$`
	// wrapper it is not defeated by line breaks inside word.
	if latinLetter.MatchString(word) {
		return "lat"
	}
	return "greek"
}
// perseusUrl makes full URI for Perseus morph's page.
func perseusUrl (word string) string {
return perseus + "?l=" + word + "&la="+ recognizeLanguage(word)
}
// excludeElementText looks up the descendants of selection that match the
// selector path and returns their combined text with surrounding
// whitespace removed.
func excludeElementText(selection *goquery.Selection, path string) string {
	matched := selection.Find(path)
	return strings.TrimSpace(matched.Text())
}
// splitElements returns the trimmed text of every element in selection,
// one slice entry per element and in selection order.
func splitElements(selection *goquery.Selection) []string {
	texts := make([]string, 0, selection.Length())
	selection.Each(func(_ int, node *goquery.Selection) {
		texts = append(texts, strings.TrimSpace(node.Text()))
	})
	return texts
}
// parseWords gets possible definitions, translations and grammar analysis from Perseus.
func parseWord (word string, collector chan Word, queue *Queue) {
w := Word{ Word: word, Url: perseusUrl(word), Analysis: []Analysis{}}
r := regexp.MustCompile(regexpMarks)
n := regexp.MustCompile(regexpDelimiters)
if r.MatchString(word) {
w.Mark = true
} else if n.MatchString(word) {
w.Delimiter = true
}
if !w.Delimiter && !w.Mark {
document, _ := goquery.NewDocument(perseusUrl(word))
document.Find("div.analysis").Each(func(i int, analysis *goquery.Selection) {
variant := excludeElementText(analysis, "h4")
translation := excludeElementText(analysis, ".lemma_definition")
f := []Form{}
analysis.Find("table tr").Each(func(i int, tr *goquery.Selection) {
tds := splitElements(tr.Find("td"))
form, grammar := tds[0], tds[1]
f = append(f, Form{form, grammar})
})
w.Analysis = append(w.Analysis, Analysis{variant, translation, f})
})
}
queue.mutex.Lock()
queue.rest -= 1
collector <- w
if queue.rest <= 0 {
close(collector)
}
queue.mutex.Unlock()
}
// parseTokens analyses every unique token of text concurrently, one
// goroutine per token, and returns the results keyed by the lower-case
// token. The returned map is empty when text contains no tokens.
func parseTokens(text string) map[string]Word {
	tokens := tokenize(text)
	words := make(map[string]Word, len(tokens))

	// The collector is closed by the last parseWord goroutine. Without any
	// tokens no goroutine is started, nobody would close it, and the
	// receive loop below would block forever.
	if len(tokens) == 0 {
		return words
	}

	queue := Queue{rest: len(tokens)}
	// The buffer holds every result, so senders never block while they
	// hold the queue mutex.
	collector := make(chan Word, len(tokens))

	for _, token := range tokens {
		go parseWord(token, collector, &queue)
	}

	for w := range collector {
		words[w.Word] = w
	}

	return words
}
// parseText splits text into tokens, keeps at most maxTextLength of them,
// has the unique tokens analysed and returns a PageData for the "parse" page
// whose Data lists the kept tokens in their original order and spelling.
func parseText(text string) PageData {
	pieces := splitText(text)
	if len(pieces) > maxTextLength {
		pieces = pieces[:maxTextLength]
	}

	analysed := parseTokens(strings.Join(pieces, " "))

	page := PageData{Page: "parse"}
	for _, piece := range pieces {
		// Results are keyed by the lower-case token; the original spelling
		// is restored for display.
		entry := analysed[strings.ToLower(piece)]
		entry.Word = piece
		page.Data = append(page.Data, entry)
	}

	return page
}
// tokenPattern matches one token: a run of letters, a run of marks
// (regexpMarks) or a run of line breaks (regexpDelimiters). It is compiled
// once at package initialisation instead of on every splitText call.
var tokenPattern = regexp.MustCompile(`[\p{L}]+|` + regexpMarks + `|` + regexpDelimiters)

// splitText returns the words, punctuation marks and line-break runs of
// logion in order of appearance. Characters that belong to none of these
// groups (spaces, for instance) are dropped. The result is nil when logion
// contains no token.
func splitText(logion string) []string {
	return tokenPattern.FindAllString(logion, -1)
}
// tokenize returns the distinct tokens of logion (words, marks and
// line-break runs) converted to lower case. The order of the result is not
// specified, and the result is nil when there are no tokens.
func tokenize(logion string) []string {
	seen := map[string]bool{}
	for _, piece := range splitText(logion) {
		if lowered := strings.ToLower(piece); lowered != "" {
			seen[lowered] = true
		}
	}

	var unique []string
	for token := range seen {
		unique = append(unique, token)
	}
	return unique
}
// --------------------------------------------------------------------------------
// renderPage parses the HTML templates matched by tpls and executes
// defaultTemplate with data, writing the result to w.
//
// If the templates cannot be parsed the client receives a 500 response.
// An error during execution is only reported on standard error, because
// part of the page may already have been written to w.
func renderPage(w http.ResponseWriter, data PageData) {
	t, err := template.ParseGlob(tpls)
	if err != nil {
		fmt.Fprintln(os.Stderr, "renderPage: parsing templates:", err)
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
		return
	}

	if err := t.ExecuteTemplate(w, defaultTemplate, data); err != nil {
		fmt.Fprintln(os.Stderr, "renderPage: executing template:", err)
	}
}
// srvDefault redirects to index page in default language
func srvDefault(w http.ResponseWriter, r *http.Request) {
renderPage(w, PageData{ Page: "index", Lang: defaultLanguage })
}
// notFound answers requests for unknown paths by redirecting the client to
// the index page with a 303 See Other status.
func notFound(w http.ResponseWriter, r *http.Request) {
	const indexPath = "/"
	http.Redirect(w, r, indexPath, http.StatusSeeOther)
}
// srvParse handles the parse page: it analyses the text submitted in the
// "text" form field and renders the result.
//
// A request whose form cannot be parsed gets a 400 response. A request
// without text, or with only whitespace, is redirected to the index page of
// the requested language instead of being answered with an empty body.
func srvParse(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)

	if err := r.ParseForm(); err != nil {
		http.Error(w, http.StatusText(http.StatusBadRequest), http.StatusBadRequest)
		return
	}

	fmt.Println("PARSE:")
	fmt.Println(r.Form)

	// A missing "text" field yields a nil slice, which joins to "".
	text := strings.TrimSpace(strings.Join(r.Form["text"], ""))
	if text == "" {
		http.Redirect(w, r, "/"+vars["lang"]+"/", http.StatusSeeOther)
		return
	}

	data := parseText(text)
	data.Lang = vars["lang"]
	renderPage(w, data)
}
// srvPage dispatches a request for /{lang}/ or /{lang}/{page}/ to the
// matching page: "parse" is handled by srvParse, "about" and the empty page
// name render the about and index pages in the requested language, and any
// other page name falls back to the index page in the default language.
func srvPage(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	page, lang := vars["page"], vars["lang"]

	if page == "parse" {
		srvParse(w, r)
		return
	}
	if page == "about" {
		renderPage(w, PageData{Page: "about", Lang: lang})
		return
	}
	if page == "" {
		renderPage(w, PageData{Page: "index", Lang: lang})
		return
	}
	srvDefault(w, r)
}
// srv registers the static asset and page handlers on the default HTTP mux
// and serves on host:port. It returns only when the server stops; a failure
// to start or serve is printed to standard output.
func srv(host string, port int) {
	address := host + ":" + strconv.Itoa(port)
	fmt.Println("Server is started at " + address + ".")

	// Page routes; unknown paths are redirected to the index by notFound.
	pages := mux.NewRouter()
	pages.NotFoundHandler = http.HandlerFunc(notFound)
	pages.HandleFunc("/", srvDefault)
	pages.HandleFunc("/{lang}/", srvPage)
	pages.HandleFunc("/{lang}/{page}/", srvPage)

	// Static files are served straight from the assets directory.
	static := http.FileServer(http.Dir(assets))
	http.Handle("/assets/", http.StripPrefix("/assets/", static))
	http.Handle("/", pages)

	if err := http.ListenAndServe(address, nil); err != nil {
		fmt.Println("ListenAndServe: ", err)
	}
}
// --------------------------------------------------------------------------------
// showHelp prints the usage summary for the CLI and web modes to standard
// output, followed by the defaults of all defined flags.
func showHelp() {
	name := filepath.Base(os.Args[0])

	fmt.Print("Usage:\n")
	fmt.Printf(" CLI: %s <text>\n WEB: %s --serve [--host] [--port]\n\n", name, name)

	flag.PrintDefaults()
}
// main parses the command line and runs one of three modes: the web server
// when --serve is given, a one-off analysis of the remaining arguments
// printed to the terminal, or the usage text when there is nothing to do.
func main() {
	flag.Usage = showHelp

	var (
		host  = flag.String("host", defaultHost, "Host to bind")
		port  = flag.Int("port", defaultPort, "Server port")
		serve = flag.Bool("serve", false, "Serve LEΞΙΣ")
	)
	flag.Parse()

	args := flag.Args()
	switch {
	case *serve:
		srv(*host, *port)
	case len(args) > 0:
		txt := strings.Join(args, " ")
		fmt.Printf("<< %s >>\n", txt)
		printPageData(parseText(txt))
		fmt.Println("")
	default:
		flag.Usage()
	}
}