package main

// Copyright (C) 2021, Maxim Lihachev
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

import (
	"os"
	"fmt"
	"flag"
	"sync"
	"regexp"
	"strings"
	"strconv"
	"net/http"
	"path/filepath"
	"html/template"

	"github.com/gorilla/mux"
	"github.com/PuerkitoBio/goquery"
)

// --------------------------------------------------------------------------------

// Default options.
const (
	defaultLanguage  = "ru"
	defaultHost      = "127.0.0.1"
	defaultPort      = 9000
	webDirectory     = "web"
	tpls             = webDirectory + "/tpl/*.html"
	assets           = webDirectory + "/assets/"
	defaultTemplate  = "page"
	maxTextLength    = 100
	perseus          = "http://www.perseus.tufts.edu/hopper/morph"
	regexpMarks      = `[\d*?()$.,!"'·;:]+`
	regexpDelimiters = `\n+`
)

// --------------------------------------------------------------------------------

type PageData struct {
	Page string
	Lang string
	Data []Word
}

type Word struct {
	Word      string
	Url       string
	Mark      bool
	Delimiter bool
	Analysis  []Analysis
}

type Analysis struct {
	Variant     string
	Translation string
	Forms       []Form
}

type Form struct {
	Form    string
	Grammar string
}

// Queue counts the word lookups that are still in flight.
type Queue struct {
	rest  int
	mutex sync.Mutex
}

// --------------------------------------------------------------------------------

// printPageData prints the grammatical analysis with indentation.
// It is used as the output formatter for the Lexis CLI.
func printPageData(data PageData) {
	for _, w := range data.Data {
		fmt.Printf("\n\033[1;32m>>> %s\033[0m\n", w.Word)

		for i, grammar := range w.Analysis {
			fmt.Printf("\n %d) \033[35m%s\033[0m / %s\n", i+1, grammar.Variant, grammar.Translation)

			for _, form := range grammar.Forms {
				fmt.Printf(" > \033[36m%s\033[0m / %s\n", form.Form, form.Grammar)
			}
		}
	}
}

// --------------------------------------------------------------------------------

// recognizeLanguage returns the probable language of a word.
// At the moment it distinguishes only Greek and Latin.
func recognizeLanguage(word string) string {
	// r := regexp.MustCompile(`^.*[α-ωΑ-Ω]+.*$`)
	r := regexp.MustCompile(`^.*[a-zA-Z]+.*$`)

	if r.MatchString(word) {
		return "lat"
	}

	return "greek"
}

// perseusUrl builds the full URL of the Perseus morph page for a word.
func perseusUrl(word string) string {
	return perseus + "?l=" + word + "&la=" + recognizeLanguage(word)
}

// excludeElementText looks up an element in the selection by the given selector
// and returns its trimmed inner text.
func excludeElementText(selection *goquery.Selection, path string) string {
	return strings.TrimSpace(selection.Find(path).Text())
}

// splitElements collects the trimmed text of the matched HTML elements into a list.
func splitElements(selection *goquery.Selection) []string {
	list := make([]string, selection.Size())

	selection.Each(func(i int, element *goquery.Selection) {
		list[i] = strings.TrimSpace(element.Text())
	})

	return list
}
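// Illustrative example (based only on the code in this file, not on any
// Perseus API documentation): for a Latin word such as "amor",
// recognizeLanguage returns "lat", so perseusUrl produces
//
//	http://www.perseus.tufts.edu/hopper/morph?l=amor&la=lat
//
// parseWord then scrapes the returned page for "div.analysis" blocks, taking
// the headword from "h4", the translation from ".lemma_definition" and the
// forms with their grammatical descriptions from the rows of the inner table.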
// parseWord gets possible definitions, translations and grammatical analysis of a word from Perseus.
func parseWord(word string, collector chan Word, queue *Queue) {
	w := Word{Word: word, Url: perseusUrl(word), Analysis: []Analysis{}}

	r := regexp.MustCompile(regexpMarks)
	n := regexp.MustCompile(regexpDelimiters)

	if r.MatchString(word) {
		w.Mark = true
	} else if n.MatchString(word) {
		w.Delimiter = true
	}

	if !w.Delimiter && !w.Mark {
		document, err := goquery.NewDocument(perseusUrl(word))
		if err == nil {
			document.Find("div.analysis").Each(func(i int, analysis *goquery.Selection) {
				variant := excludeElementText(analysis, "h4")
				translation := excludeElementText(analysis, ".lemma_definition")

				f := []Form{}

				analysis.Find("table tr").Each(func(i int, tr *goquery.Selection) {
					tds := splitElements(tr.Find("td"))
					// Skip rows that do not contain at least a form and its grammatical description.
					if len(tds) >= 2 {
						f = append(f, Form{tds[0], tds[1]})
					}
				})

				w.Analysis = append(w.Analysis, Analysis{variant, translation, f})
			})
		}
	}

	// The goroutine that finishes last closes the collector channel.
	queue.mutex.Lock()
	queue.rest--
	collector <- w
	if queue.rest <= 0 {
		close(collector)
	}
	queue.mutex.Unlock()
}

// parseTokens gets the analysis of each word in the list concurrently.
func parseTokens(text string) map[string]Word {
	words := make(map[string]Word)

	tokens := tokenize(text)

	// Without tokens there is nothing to collect, and no goroutine would ever close the channel.
	if len(tokens) == 0 {
		return words
	}

	queue := Queue{rest: len(tokens)}
	collector := make(chan Word, queue.rest)

	for _, w := range tokens {
		go parseWord(w, collector, &queue)
	}

	for w := range collector {
		words[w.Word] = w
	}

	return words
}

// parseText starts parsing of each word in the text and returns a PageData structure.
func parseText(text string) PageData {
	textForParsing := splitText(text)

	data := PageData{Page: "parse"}

	if len(textForParsing) > maxTextLength {
		textForParsing = textForParsing[:maxTextLength]
	}

	forms := parseTokens(strings.Join(textForParsing, " "))

	for _, w := range textForParsing {
		form := forms[strings.ToLower(w)]
		form.Word = w
		data.Data = append(data.Data, form)
	}

	return data
}

// splitText returns the list of words and punctuation marks in the text.
func splitText(logion string) []string {
	re := regexp.MustCompile(`[\p{L}]+|` + regexpMarks + `|` + regexpDelimiters)
	return re.FindAllString(logion, -1)
}

// tokenize returns the list of unique words and punctuation marks in lower case.
func tokenize(logion string) []string {
	tokens := make(map[string]bool)

	for _, e := range splitText(logion) {
		q := strings.ToLower(e)
		if _, exists := tokens[q]; !exists && q != "" {
			tokens[q] = true
		}
	}

	var result []string
	for item := range tokens {
		result = append(result, item)
	}

	return result
}

// --------------------------------------------------------------------------------

// renderPage fills in the Go HTML template and writes the result to the response.
func renderPage(w http.ResponseWriter, data PageData) {
	t, err := template.ParseGlob(tpls)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	t.ExecuteTemplate(w, defaultTemplate, data)
}

// srvDefault renders the index page in the default language.
func srvDefault(w http.ResponseWriter, r *http.Request) {
	renderPage(w, PageData{Page: "index", Lang: defaultLanguage})
}

// notFound redirects the user to the index page.
func notFound(w http.ResponseWriter, r *http.Request) {
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
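// Example request (an assumption for illustration: the server was started with
// the default host and port, and the templates exist under web/tpl/):
//
//	curl --data-urlencode 'text=arma virumque cano' http://127.0.0.1:9000/lat/parse/
//
// The handler below reads the "text" form field, runs parseText on it and
// renders the "parse" template with the result.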
// srvParse shows the parse page with the analysis of the submitted text.
func srvParse(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()

	// Debug output of the submitted form values.
	fmt.Println("PARSE:")
	fmt.Println(r.Form)
	fmt.Println(r.Form["url_long"])

	vars := mux.Vars(r)

	if text, ok := r.Form["text"]; ok {
		t := strings.TrimSpace(strings.Join(text, ""))

		if t != "" {
			data := parseText(t)
			data.Lang = vars["lang"]
			renderPage(w, data)
		} else {
			http.Redirect(w, r, "/"+vars["lang"]+"/", http.StatusSeeOther)
		}
	}
}

// srvPage dispatches the request to the handler of the requested page.
func srvPage(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)

	switch vars["page"] {
	case "parse":
		srvParse(w, r)
	case "about":
		renderPage(w, PageData{Page: "about", Lang: vars["lang"]})
	case "":
		renderPage(w, PageData{Page: "index", Lang: vars["lang"]})
	default:
		srvDefault(w, r)
	}
}

// srv runs the server on the specified host and port.
func srv(host string, port int) {
	fmt.Println("Server started at " + host + ":" + strconv.Itoa(port) + ".")

	router := mux.NewRouter()

	http.Handle("/assets/", http.StripPrefix("/assets/", http.FileServer(http.Dir(assets))))

	router.HandleFunc("/", srvDefault)
	router.HandleFunc("/{lang}/", srvPage)
	router.HandleFunc("/{lang}/{page}/", srvPage)
	router.NotFoundHandler = http.HandlerFunc(notFound)

	http.Handle("/", router)

	err := http.ListenAndServe(host+":"+strconv.Itoa(port), nil)
	if err != nil {
		fmt.Println("ListenAndServe: ", err)
	}
}

// --------------------------------------------------------------------------------

func showHelp() {
	program := filepath.Base(os.Args[0])

	fmt.Printf("Usage:\n")
	fmt.Printf(" CLI: %s <text>\n", program)
	fmt.Printf(" WEB: %s --serve [--host] [--port]\n\n", program)

	flag.PrintDefaults()
}

func main() {
	flag.Usage = showHelp

	host := flag.String("host", defaultHost, "Host to bind")
	port := flag.Int("port", defaultPort, "Server port")
	serve := flag.Bool("serve", false, "Serve LEΞΙΣ")

	flag.Parse()

	if *serve {
		srv(*host, *port)
	} else if len(flag.Args()) > 0 {
		txt := strings.Join(flag.Args(), " ")

		fmt.Printf("<< %s >>\n", txt)
		printPageData(parseText(txt))
		fmt.Println("")
	} else {
		flag.Usage()
	}
}
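// Example invocations (an assumption for illustration: the binary is built as "lexis"):
//
//	lexis μῆνιν ἄειδε θεά       # CLI mode: print the analysis of each word
//	lexis --serve --port 9000   # web mode: serve LEΞΙΣ at 127.0.0.1:9000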