commit
0b55f3cc43
13 changed files with 1134 additions and 0 deletions
@ -0,0 +1,25 @@
@@ -0,0 +1,25 @@
|
||||
.stack-work/ |
||||
json-search-server.cabal |
||||
|
||||
*~ |
||||
dist |
||||
dist-* |
||||
cabal-dev |
||||
*.o |
||||
*.hi |
||||
*.chi |
||||
*.chs.h |
||||
*.dyn_o |
||||
*.dyn_hi |
||||
.hpc |
||||
.hsenv |
||||
.cabal-sandbox/ |
||||
cabal.sandbox.config |
||||
*.prof |
||||
*.aux |
||||
*.hp |
||||
*.eventlog |
||||
cabal.project.local |
||||
.HTF/ |
||||
.ghc.environment.* |
||||
|
@ -0,0 +1,38 @@
@@ -0,0 +1,38 @@
|
||||
application = json-search-server
|
||||
stack = stack
|
||||
hlint = hlint
|
||||
|
||||
all: app docs |
||||
|
||||
build: app |
||||
|
||||
deps: |
||||
$(stack) build --only-dependencies
|
||||
|
||||
app: |
||||
$(stack) build
|
||||
|
||||
install: |
||||
$(stack) install
|
||||
|
||||
docs: |
||||
$(stack) exec -- haddock --html --pretty-html src/*.hs app/Main.hs --hyperlinked-source --optghc=-DHADDOCK --odir=docs --show-all --use-unicode
|
||||
|
||||
lint: hlint |
||||
hlint: |
||||
$(hlint) .; :
|
||||
|
||||
check: |
||||
$(stack) build --force-dirty --ghc-options="-Weverything"
|
||||
|
||||
static: |
||||
$(stack) build --force-dirty --ghc-options="-optl-static -optc-static"
|
||||
|
||||
exec: |
||||
$(stack) exec -- $(application) $(filter-out $@, $(MAKECMDGOALS))
|
||||
|
||||
%: |
||||
@true
|
||||
|
||||
.PHONY: build app docs all hlint static exec check install |
||||
|
@ -0,0 +1,326 @@
@@ -0,0 +1,326 @@
|
||||
# Json Search Server |
||||
|
||||
**v0.1.0** |
||||
|
||||
|
||||
This application provides fuzzy search server for data stored in JSON format. |
||||
|
||||
|
||||
The purpose of the development of this application was to use it with the website written on [Jekyll](https://jekyllrb.com/). |
||||
|
||||
--- |
||||
|
||||
## License |
||||
``` |
||||
Copyright (C) 2019, Maxim Lihachev, <envrm@yandex.ru> |
||||
|
||||
This program is free software: you can redistribute it and/or modify |
||||
it under the terms of the GNU General Public License as published by |
||||
the Free Software Foundation, either version 3 of the License, or |
||||
(at your option) any later version. |
||||
|
||||
This program is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
GNU General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU General Public License |
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. |
||||
``` |
||||
|
||||
# Algorithm |
||||
|
||||
The logic of this application is pretty simple: |
||||
* All fields in the file are divided into separate words. |
||||
* Each unique word in the search string is compared to the words in the file. |
||||
* If the match is exact, the highest score is awarded. |
||||
* If there is an inaccurate coincidence, the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) is calculated and the word score is formed on its basis. |
||||
* Results are sorted in descending order of accuracy. |
||||
|
||||
|
||||
# How to use |
||||
|
||||
## Build |
||||
|
||||
``` |
||||
$ make build |
||||
or |
||||
$ make static |
||||
``` |
||||
|
||||
There is [Vagrant](https://www.vagrantup.com/) file configured for starting Centos 7. This VM is used for making statically-linked binary file for target server. |
||||
|
||||
``` |
||||
host:~$ vagrant up |
||||
host:~$ vagrant ssh |
||||
|
||||
vm:~$ cd /vagrant |
||||
vm:~$ make static |
||||
``` |
||||
|
||||
Static executable file is in bin/ directory. |
||||
|
||||
## Run |
||||
``` |
||||
$ make exec |
||||
|
||||
json-search-server v0.1.0: seach server for Json |
||||
|
||||
json-search-server [OPTIONS] |
||||
|
||||
Common flags: |
||||
-p --port=3000 Search server port |
||||
-l --logs=apache apache | simple | json | disable | full |
||||
-c --cached Store Json data into memory |
||||
-j -f --json=data.json --file Json file name |
||||
-? --help Display help message |
||||
-V --version Print version information |
||||
--numeric-version Print just the version number |
||||
``` |
||||
|
||||
## Install |
||||
|
||||
``` |
||||
$ make install |
||||
``` |
||||
|
||||
|
||||
# Requests |
||||
|
||||
## Get Server Settings |
||||
|
||||
```bash |
||||
$ curl -sq http://localhost:3000/info |
||||
|
||||
{ |
||||
"cached": false, |
||||
"logs": "full", |
||||
"file": "sample.json", |
||||
"port": 3000 |
||||
} |
||||
``` |
||||
|
||||
## Health Check |
||||
|
||||
```bash |
||||
$ curl -sq http://localhost:3000/health |
||||
|
||||
{ |
||||
"status": "ok", |
||||
"message": "2019-06-19 09:03:26.402385 UTC" |
||||
} |
||||
``` |
||||
|
||||
```bash |
||||
$ chmod a-r sample.json |
||||
$ curl -sq http://localhost:3000/health |
||||
|
||||
{ |
||||
"status": "fail", |
||||
"message": "2019-06-19 09:05:32.950256 UTC sample.json: openFile: permission denied (Permission denied)" |
||||
} |
||||
``` |
||||
|
||||
## Search |
||||
|
||||
```bash |
||||
$ curl -sq http://localhost:3000/search/article |
||||
$ curl -sq http://localhost:3000/search?query=article |
||||
|
||||
[ |
||||
[ |
||||
{ |
||||
"url": "/tags/foo.html", |
||||
"authors": "Author I, Author II", |
||||
"content": "This is article about Foo and Bar.", |
||||
"year": "1990", |
||||
"title": "Page one" |
||||
}, |
||||
1, |
||||
-100 |
||||
] |
||||
] |
||||
``` |
||||
|
||||
## Logs |
||||
|
||||
There are few formats for log messages. It is possible to disable logs completely passing argument `--log disable`. |
||||
|
||||
|
||||
### --log apache (default) |
||||
|
||||
```bash |
||||
127.0.0.1 - - [19/Jun/2019:12:14:16 +0300] "GET /info HTTP/1.1" 200 - "" "curl/7.54.0" |
||||
127.0.0.1 - - [19/Jun/2019:12:14:18 +0300] "GET /health HTTP/1.1" 200 - "" "curl/7.54.0" |
||||
``` |
||||
|
||||
### --log simple |
||||
|
||||
```bash |
||||
GET /health |
||||
Accept: */* |
||||
Status: 200 OK 0.00003s |
||||
GET /info |
||||
Accept: */* |
||||
Status: 200 OK 0.000018s |
||||
``` |
||||
|
||||
### --log json |
||||
|
||||
```json |
||||
{"time":"19/Jun/2019:12:20:06 +0300","response":{"status":200,"size":null,"body":null},"request":{"httpVersion":"1.1","path":"/health","size":0,"body":"","durationMs":7.0e-2,"remoteHost":{"hostAddress":"127.0.0.1","port":63436},"headers":[["Host","localhost:3000"],["User-Agent","curl/7.54.0"],["Accept","*/*"]],"queryString":[],"method":"GET"}} |
||||
{"time":"19/Jun/2019:12:20:07 +0300","response":{"status":200,"size":null,"body":null},"request":{"httpVersion":"1.1","path":"/info","size":0,"body":"","durationMs":4.0e-2,"remoteHost":{"hostAddress":"127.0.0.1","port":63438},"headers":[["Host","localhost:3000"],["User-Agent","curl/7.54.0"],["Accept","*/*"]],"queryString":[],"method":"GET"}} |
||||
``` |
||||
|
||||
Or pretty-printed: |
||||
|
||||
```json |
||||
{ |
||||
"time": "19/Jun/2019:12:20:06 +0300", |
||||
"response": { |
||||
"status": 200, |
||||
"size": null, |
||||
"body": null |
||||
}, |
||||
"request": { |
||||
"httpVersion": "1.1", |
||||
"path": "/health", |
||||
"size": 0, |
||||
"body": "", |
||||
"durationMs": 0.07, |
||||
"remoteHost": { |
||||
"hostAddress": "127.0.0.1", |
||||
"port": 63436 |
||||
}, |
||||
"headers": [ |
||||
[ |
||||
"Host", |
||||
"localhost:3000" |
||||
], |
||||
[ |
||||
"User-Agent", |
||||
"curl/7.54.0" |
||||
], |
||||
[ |
||||
"Accept", |
||||
"*/*" |
||||
] |
||||
], |
||||
"queryString": [], |
||||
"method": "GET" |
||||
} |
||||
} |
||||
``` |
||||
|
||||
|
||||
### --log full |
||||
|
||||
```bash |
||||
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
||||
"2019-06-19 09:05:22.226024 UTC" |
||||
"------------------------------------------------------------------------------------------------------------------------" |
||||
"Query: /health" |
||||
"------------------------------------------------------------------------------------------------------------------------" |
||||
Request |
||||
{ requestMethod = "GET" |
||||
, httpVersion = HTTP/1.1 |
||||
, rawPathInfo = "/health" |
||||
, rawQueryString = "" |
||||
, requestHeaders = |
||||
[ |
||||
( "Host" |
||||
, "localhost:3000" |
||||
) |
||||
, |
||||
( "User-Agent" |
||||
, "curl/7.54.0" |
||||
) |
||||
, |
||||
( "Accept" |
||||
, "*/*" |
||||
) |
||||
] |
||||
, isSecure = False |
||||
, remoteHost = 127.0.0.1:63212 |
||||
, pathInfo = [ "health" ] |
||||
, queryString = [] |
||||
, requestBody = <IO ByteString> |
||||
, vault = <Vault> |
||||
, requestBodyLength = KnownLength 0 |
||||
, requestHeaderHost = Just "localhost:3000" |
||||
, requestHeaderRange = Nothing |
||||
} |
||||
``` |
||||
|
||||
## Caching |
||||
|
||||
It is possible to store all data in RAM and use it even if the file is not readable. |
||||
To do this, specify the argument `--cached`. |
||||
|
||||
|
||||
# NGINX |
||||
|
||||
For using this service behind nginx web server might be used following configuration: |
||||
|
||||
```nginx |
||||
upstream search_backend { |
||||
server 127.0.0.1:3000; |
||||
} |
||||
|
||||
server { |
||||
listen 8080; |
||||
server_name search.server www.search.server; |
||||
|
||||
# ... |
||||
|
||||
location / { |
||||
add_header 'Access-Control-Allow-Origin' "$http_origin"; |
||||
add_header 'Access-Control-Allow-Methods' 'GET, POST'; |
||||
add_header 'Access-Control-Allow-Credentials' 'true'; |
||||
add_header 'Access-Control-Allow-Headers' 'User-Agent,Keep-Alive,Content-Type'; |
||||
|
||||
proxy_set_header Host $host; |
||||
proxy_set_header X-Real-IP $remote_addr; |
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; |
||||
proxy_set_header X-Forwarded-Host $remote_addr; |
||||
proxy_pass http://search_backend$uri?$args; |
||||
} |
||||
|
||||
# This prevents intruders from obtaining information |
||||
# about the internal structure of the server. |
||||
location /info { |
||||
proxy_pass http://search_backend/health; |
||||
} |
||||
} |
||||
``` |
||||
|
||||
# Jekyll |
||||
|
||||
Creating a Json file using jekyll will look like this: |
||||
|
||||
```bash |
||||
--- |
||||
layout: none |
||||
search: none |
||||
--- |
||||
|
||||
{% assign all_pages = site.pages | where_exp: 'p', 'p.search != "none"' | where_exp: 'p', 'p.layout != "none"' %} |
||||
|
||||
[ |
||||
|
||||
{% for p in all_pages %} |
||||
{% capture all_authors %}{{ p.authors | join: ',' }}, {{ p.translators | join: ',' }}, {{ p.editors | join: ',' }}{% endcapture %} |
||||
{ |
||||
"title": "{{ p.title | split: '<br />' | join: ' ' | xml_escape }}{% if p.tag %} «{{ p.tag }}» {% endif %}", |
||||
"authors": "{{ p.authors | join: ',' }}", |
||||
"persons": "{{ all_authors }}", |
||||
"content": {{ p.content | strip_html | jsonify }}, |
||||
"tags": "{{ p.tags | join: ', ' }}", |
||||
"year": "{{ p.year }}", |
||||
"url": "{{ p.url | xml_escape }}" |
||||
} |
||||
{% unless forloop.last %},{% endunless %} |
||||
{% endfor %} |
||||
|
||||
] |
||||
``` |
@ -0,0 +1,2 @@
@@ -0,0 +1,2 @@
|
||||
import Distribution.Simple |
||||
main = defaultMain |
Binary file not shown.
@ -0,0 +1,70 @@
@@ -0,0 +1,70 @@
|
||||
{-# LANGUAGE NoImplicitPrelude #-} |
||||
{-# LANGUAGE Unsafe #-} |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- Copyright (C) 2019, Maxim Lihachev, <envrm@yandex.ru> |
||||
|
||||
-- This program is free software: you can redistribute it and/or modify |
||||
-- it under the terms of the GNU General Public License as published by |
||||
-- the Free Software Foundation, either version 3 of the License, or |
||||
-- (at your option) any later version. |
||||
|
||||
-- This program is distributed in the hope that it will be useful, |
||||
-- but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
-- GNU General Public License for more details. |
||||
|
||||
-- You should have received a copy of the GNU General Public License |
||||
-- along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Json-search service |
||||
-- |
||||
module Main (main) where |
||||
|
||||
import Relude ( Maybe (Nothing), Bool (False), IO, ($), id, null ) |
||||
|
||||
import System.Exit ( die ) |
||||
import System.Environment ( getArgs, withArgs ) |
||||
import System.Console.CmdArgs ( cmdArgs, help, summary, typ, name, program, (&=) ) |
||||
|
||||
import Text.Pretty.Simple ( pPrint ) |
||||
|
||||
import Server ( startServer, ServerSettings (ServerSettings, file, cached, logs, port), checkJsonFile ) |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Default command line arguments |
||||
-- |
||||
defaultSettings :: IO ServerSettings |
||||
defaultSettings = cmdArgs |
||||
$ ServerSettings { port = 3000 &= typ "3000" &= help "Search server port" |
||||
, logs = "apache" &= typ "apache" &= help "apache | simple | json | disable | full" |
||||
, cached = False &= typ "False" &= help "Store Json data into memory" &= name "cached" |
||||
, file = Nothing &= typ "data.json" &= help "Json file name" &= name "json" |
||||
} &= program "json-search-server" |
||||
&= summary "json-search-server v0.1.0: seach server for Json" |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Start application |
||||
-- |
||||
main :: IO () |
||||
main = do |
||||
args <- getArgs |
||||
settings <- (if null args then withArgs ["--help"] else id) defaultSettings |
||||
|
||||
pPrint settings |
||||
|
||||
let fileStatus = checkJsonFile (file settings) in |
||||
case fileStatus of |
||||
"ok" -> |
||||
startServer settings |
||||
_err -> |
||||
die fileStatus |
||||
|
Binary file not shown.
@ -0,0 +1,70 @@
@@ -0,0 +1,70 @@
|
||||
name: json-search-server |
||||
version: 0.1.0.0 |
||||
# github: "" |
||||
license: GPL-3.0-only |
||||
author: "Maxim Likhachev" |
||||
maintainer: "envrm@yandex.ru" |
||||
copyright: "2019 Maxim Likhachev" |
||||
|
||||
library: |
||||
source-dirs: src |
||||
|
||||
executables: |
||||
json-search-server: |
||||
main: Main.hs |
||||
source-dirs: app |
||||
dependencies: json-search-server |
||||
|
||||
extra-source-files: |
||||
- README.md |
||||
|
||||
dependencies: |
||||
- base |
||||
- relude ^>=0.4.0 |
||||
- cmdargs ^>=0.10.20 |
||||
- aeson ^>=1.4.0.0 |
||||
- edit-distance ^>=0.2.2.1 |
||||
- text ^>=1.2.3.1 |
||||
- time ^>=1.8.0.2 |
||||
- uri-encode ^>=1.5.0.5 |
||||
- scotty ^>=0.11.4 |
||||
- wai ^>=3.2.2 |
||||
- wai-extra ^>=3.0.26 |
||||
- pretty-simple ^>=2.2.0.1 |
||||
- MissingH ^>=1.4.1.0 |
||||
- data-default-class ^>=0.1.2.0 |
||||
|
||||
ghc-options: |
||||
- -fforce-recomp |
||||
- -fobject-code |
||||
- -O2 |
||||
- -optc-O3 |
||||
- -funfolding-use-threshold=16 |
||||
- -threaded |
||||
|
||||
- -Wincomplete-uni-patterns |
||||
- -Wincomplete-record-updates |
||||
- -Wmonomorphism-restriction |
||||
- -Wimplicit-prelude |
||||
- -Wmissing-local-signatures |
||||
- -Wmissing-exported-signatures |
||||
- -Wmissing-export-lists |
||||
- -Wmissing-import-lists |
||||
- -Wmissing-home-modules |
||||
- -Wmissing-methods |
||||
- -Wunused-packages |
||||
- -Wdodgy-imports |
||||
- -Widentities |
||||
- -Wredundant-constraints |
||||
- -Wpartial-fields |
||||
- -Wmissed-specialisations |
||||
- -Wcpp-undef |
||||
- -Wunused-imports |
||||
- -Wunused-matches |
||||
|
||||
- -fno-cse |
||||
- -fhide-source-paths |
||||
- -freverse-errors |
||||
|
||||
# - -Weverything |
||||
|
@ -0,0 +1,29 @@
@@ -0,0 +1,29 @@
|
||||
[ |
||||
{ |
||||
"title": "Page one", |
||||
"authors": "Author I, Author II", |
||||
"persons": "Editor, Translator", |
||||
"content": "This is article about Foo and Bar.", |
||||
"tags": "foo, bar", |
||||
"year": "1990", |
||||
"url": "/pages/foo.html" |
||||
}, |
||||
{ |
||||
"title": "Page two", |
||||
"authors": "Author III", |
||||
"persons": "", |
||||
"content": "Sample page.", |
||||
"tags": "bar, baz", |
||||
"year": "2000", |
||||
"url": "/pages/bar.html" |
||||
}, |
||||
{ |
||||
"title": "Page three", |
||||
"authors": "Author IV", |
||||
"persons": "Translator", |
||||
"content": "", |
||||
"tags": "baz", |
||||
"year": "2010", |
||||
"url": "/pages/baz.html" |
||||
} |
||||
] |
@ -0,0 +1,243 @@
@@ -0,0 +1,243 @@
|
||||
{-# LANGUAGE NoImplicitPrelude #-} |
||||
{-# LANGUAGE DeriveGeneric #-} |
||||
{-# LANGUAGE OverloadedStrings #-} |
||||
{-# LANGUAGE PatternSynonyms #-} |
||||
{-# LANGUAGE ViewPatterns #-} |
||||
{-# LANGUAGE TransformListComp #-} |
||||
{-# LANGUAGE Unsafe #-} |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- Copyright (C) 2019, Maxim Lihachev, <envrm@yandex.ru> |
||||
|
||||
-- This program is free software: you can redistribute it and/or modify |
||||
-- it under the terms of the GNU General Public License as published by |
||||
-- the Free Software Foundation, either version 3 of the License, or |
||||
-- (at your option) any later version. |
||||
|
||||
-- This program is distributed in the hope that it will be useful, |
||||
-- but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
-- GNU General Public License for more details. |
||||
|
||||
-- You should have received a copy of the GNU General Public License |
||||
-- along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Fuzzy search for JSON data |
||||
-- |
||||
module Search (Json, tokenize, findJsonObjects) where |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
import Relude ( Maybe (Just, Nothing), Bool, Int, String, Text |
||||
, otherwise, sortWith, toString, toText, div |
||||
, (-), (*), ($), (&), (<$>), (==), (<>), (<), (>), (>=), (&&), (.) ) |
||||
|
||||
import Data.Aeson ( FromJSON, ToJSON, toJSON, object, (.=) ) |
||||
import Data.Char ( toLower, isAlphaNum ) |
||||
import Data.List ( minimum, map, sum, elem, filter, length, nub ) |
||||
import GHC.Generics ( Generic ) |
||||
import Text.EditDistance ( levenshteinDistance, defaultEditCosts ) |
||||
|
||||
import qualified Data.Text as T ( map, length, take, toLower, strip, words ) |
||||
import qualified Network.URI.Encode as URI ( decodeText ) |
||||
|
||||
import Settings ( maxContentLength, minPatternLength |
||||
, outOfScore, fullScore, scoreMultiplier, replaceChar ) |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
pattern T :: String -> Text |
||||
pattern T str <- (toString -> str) |
||||
where T = toText |
||||
|
||||
data FindStrictness = Strict -- ^ Exact match |
||||
| Fuzzy -- ^ Fuzzy search |
||||
|
||||
-- | Структура 'Score' |
||||
data Score = Score { matches :: !Int -- ^ Count of matches |
||||
, diffs :: !Int -- ^ Count of differences (errors) or 'fullScore' |
||||
} |
||||
|
||||
-- |
||||
-- | Json format: |
||||
-- |
||||
-- [ |
||||
-- { |
||||
-- "title": "Page one", |
||||
-- "authors": "Author I, Author II", |
||||
-- "persons": "Editor, Translator", |
||||
-- "content": "This is article about Foo and Bar.", |
||||
-- "tags": "foo, bar", |
||||
-- "year": "1990", |
||||
-- "url": "/pages/foo.html" |
||||
-- }, |
||||
-- {...} |
||||
-- ] |
||||
-- |
||||
data Json = Json { title :: !Text |
||||
, authors :: !Text |
||||
, persons :: !Text |
||||
, content :: !Text |
||||
, tags :: !Text |
||||
, year :: !Text |
||||
, url :: !Text } deriving (Generic) |
||||
|
||||
instance FromJSON Json |
||||
|
||||
-- |
||||
-- | Json output preprocessing |
||||
-- |
||||
-- * URLs are decoded. |
||||
-- * "content" is shortened. |
||||
-- |
||||
instance ToJSON Json where |
||||
toJSON jsonObject = |
||||
object [ "title" .= title jsonObject |
||||
, "authors" .= authors jsonObject |
||||
, "year" .= year jsonObject |
||||
, "url" .= (url jsonObject & URI.decodeText) |
||||
, "content" .= (content jsonObject & shortenWith "..." maxContentLength) ] |
||||
-- "tags" .= (tags jsonObject & shortenWith "..." maxContentLength) ] |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Shorten text to specified length and append a given suffix. |
||||
-- |
||||
-- >>> shortenWith "" 8 "1234567890" |
||||
-- "12345678" |
||||
-- |
||||
-- >>> shortenWith "..." 8 "1234567890" |
||||
-- "12345..." |
||||
-- |
||||
shortenWith :: Text -> Int -> Text -> Text |
||||
shortenWith suffix n string | T.length string >= n = T.strip (T.take (n - T.length suffix) string) <> suffix |
||||
| otherwise = string |
||||
|
||||
-- |
||||
-- | |
||||
-- |
||||
valid :: Score -> Bool |
||||
valid (Score x y) = x > 0 && y < 0 |
||||
|
||||
-- | |
||||
sumall :: (a -> Int) -> [a] -> Int |
||||
sumall fn = sum . map fn |
||||
|
||||
-- |
||||
-- | Calculation of relevance: |
||||
-- |
||||
-- * For each complete match with the template (score == 0), |
||||
-- 'fullScore' points are given (100 by default), |
||||
-- the remaining points are added together. |
||||
-- |
||||
-- * Result with a minus is used for |
||||
-- correct sorting by decreasing accuracy of results. |
||||
-- |
||||
-- >>> getScore [Score { matches = 1, diffs = 0 }, Score { matches = 2, diffs = 1 }] |
||||
-- "-1020" |
||||
-- |
||||
getScore :: [Score] -> Int |
||||
getScore = sumall scores |
||||
where scores (Score x 0) = fullScore * x |
||||
scores (Score x s) = -x * scoreMultiplier `div` s |
||||
|
||||
-- |
||||
-- | Splitting text into separate words. |
||||
-- |
||||
-- * Keeping only alpha-num characters. |
||||
-- * Deleting words shorter than 'minPatternLength'. |
||||
-- |
||||
-- >>> tokenize "an alpha, a beta: a gam-ma; and a (delta)" |
||||
-- ["alpha", "beta", "gam", "ma", "delta"] |
||||
-- |
||||
tokenize :: Text -> [Text] |
||||
tokenize xs = filter (\w -> T.length w >= minPatternLength) wds |
||||
where wds = T.words $ T.map isChar xs |
||||
isChar ch | isAlphaNum ch = replaceChar . toLower $ ch |
||||
| otherwise = ' ' |
||||
|
||||
-- | [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) computing |
||||
levenshtein :: Text -> Text -> Int |
||||
levenshtein str txt = levenshteinDistance defaultEditCosts (toString str) (toString txt) |
||||
|
||||
-- |
||||
-- | Comparing a word with all the words of a text |
||||
-- and returning the 'Score'. |
||||
-- |
||||
findWord :: [Text] -> Text -> Score |
||||
findWord txt word | word `elem` txt = Score { matches = 1, diffs = 0 } |
||||
| otherwise = Score { matches = lns, diffs = ms } |
||||
where ls = [ l | wd <- txt, let l = levenshtein word wd, l < outOfScore ] |
||||
lns = length ls |
||||
ms | lns > 0 = minimum ls |
||||
| otherwise = outOfScore |
||||
|
||||
-- |
||||
-- | Evaluation of the sameness of two strings. |
||||
-- |
||||
-- >>> locate "string" "string" Strict |
||||
-- Score 1 -1000 -- exact match without any errors |
||||
-- |
||||
-- >>> locate "string" "strings" Fuzzy |
||||
-- Score 1 -10 -- match without one difference |
||||
-- |
||||
-- >>> locate "string" "text" Fuzzy |
||||
-- Score 0 outOfScore -- string not found |
||||
-- |
||||
locate :: Text -> Text -> FindStrictness -> Score |
||||
-- Empty string or empty text |
||||
locate (T[]) _ _ = Score { matches = 0, diffs = outOfScore } |
||||
locate _ (T[]) _ = Score { matches = 0, diffs = outOfScore } |
||||
-- Strict search |
||||
locate str txt Strict | T.toLower str == T.toLower txt = Score { matches = 1, diffs = fullScore } |
||||
| otherwise = Score { matches = 0, diffs = outOfScore } |
||||
-- Fuzzy search |
||||
locate str txt Fuzzy = Score { matches = sumall matches trs, diffs = getScore trs } |
||||
where trs = findWord wds <$> sxs |
||||
wds = tokenize txt |
||||
sxs = nub $ tokenize str |
||||
|
||||
-- |
||||
-- | Alias for 'collision'. |
||||
-- |
||||
-- >>> "string" ==~ "Text with string" |
||||
-- Score 1 fullScore |
||||
-- |
||||
(==~) = locate |
||||
|
||||
-- |
||||
-- | Compare all file fields with a string. |
||||
-- |
||||
anyMatch :: Text -> Json -> Score |
||||
anyMatch string jsonObject = Score { matches = length l, diffs = sumall diffs l } |
||||
where l = filter valid [ string ==~ fullname $ Fuzzy, |
||||
string ==~ content jsonObject $ Fuzzy, |
||||
string ==~ year jsonObject $ Strict, |
||||
string ==~ tags jsonObject $ Fuzzy |
||||
] |
||||
fullname = persons jsonObject <> " " <> title jsonObject |
||||
-- |
||||
-- | Alias for 'anyMatch'. |
||||
-- |
||||
-- >>> "string" `within` Json |
||||
-- |
||||
within = anyMatch |
||||
|
||||
-- |
||||
-- | Search for Json objects which include passing string. |
||||
-- |
||||
findJsonObjects :: Maybe [Json] -> Text -> [(Json, Int, Int)] |
||||
findJsonObjects Nothing _ = [] |
||||
findJsonObjects (Just objs) string | T.length string < minPatternLength = [] |
||||
| otherwise = foundPages |
||||
where foundPages = [ (obj, mcs, dfs) | obj <- objs, |
||||
let item = string `within` obj, valid item, |
||||
let mcs = matches item, |
||||
let dfs = diffs item, |
||||
then sortWith by dfs] |
||||
|
@ -0,0 +1,201 @@
@@ -0,0 +1,201 @@
|
||||
{-# LANGUAGE NoImplicitPrelude #-} |
||||
{-# LANGUAGE DeriveAnyClass #-} |
||||
{-# LANGUAGE DeriveGeneric #-} |
||||
{-# LANGUAGE DeriveDataTypeable #-} |
||||
{-# LANGUAGE OverloadedStrings #-} |
||||
{-# LANGUAGE ScopedTypeVariables #-} |
||||
{-# LANGUAGE Unsafe #-} |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- Copyright (C) 2019, Maxim Lihachev, <envrm@yandex.ru> |
||||
|
||||
-- This program is free software: you can redistribute it and/or modify |
||||
-- it under the terms of the GNU General Public License as published by |
||||
-- the Free Software Foundation, either version 3 of the License, or |
||||
-- (at your option) any later version. |
||||
|
||||
-- This program is distributed in the hope that it will be useful, |
||||
-- but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
-- GNU General Public License for more details. |
||||
|
||||
-- You should have received a copy of the GNU General Public License |
||||
-- along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Scotty-based web service which provides API for "Search" module. |
||||
-- |
||||
module Server (startServer, ServerSettings (..), checkHealth, checkJsonFile) where |
||||
|
||||
|
||||
import Relude ( Maybe (Just, Nothing), String, Text, Int, Bool, IO, Show |
||||
, ($), (.), (>>), (=<<), (>>=), (<>) |
||||
, otherwise, liftIO, show, return, replicate ) |
||||
|
||||
import Data.Aeson ( ToJSON, decodeFileStrict' ) |
||||
import Web.Scotty ( ActionM, scotty, get, json, notFound, param, middleware, redirect ) |
||||
import Network.Wai ( Middleware, Application, Request, Response, ResponseReceived, rawQueryString, rawPathInfo ) |
||||
import Network.Wai.Middleware.RequestLogger.JSON ( formatAsJSON ) |
||||
import Network.Wai.Middleware.RequestLogger ( logStdout, logStdoutDev, mkRequestLogger, outputFormat |
||||
, OutputFormat (CustomOutputFormat, CustomOutputFormatWithDetails) ) |
||||
|
||||
import Text.Pretty.Simple ( pPrint ) |
||||
import Foreign.Marshal.Unsafe ( unsafeLocalState ) |
||||
import GHC.Generics ( Generic ) |
||||
import Data.Default.Class ( Default (def) ) |
||||
|
||||
import System.Console.CmdArgs ( Data ) |
||||
import System.IO ( openFile, hClose, IOMode (ReadMode) ) |
||||
import Control.Exception ( handle, IOException ) |
||||
import Control.Monad ( join ) |
||||
import Data.Time ( getCurrentTime, UTCTime ) |
||||
import Data.Text.Encoding ( decodeUtf8 ) |
||||
import Network.URI.Encode ( decodeText ) |
||||
|
||||
import Search ( findJsonObjects, Json ) |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Server settings' format. |
||||
-- |
||||
data ServerSettings = ServerSettings { port :: Int |
||||
, logs :: String |
||||
, cached :: Bool |
||||
, file :: Maybe String |
||||
} deriving (Show, Data, Generic, ToJSON) |
||||
|
||||
-- |
||||
-- | Healthcheck's format. |
||||
-- |
||||
data Health = Health { status :: !String |
||||
, message :: !String } deriving (Show, Generic, ToJSON) |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | If 'filename' is readable then "ok" or error message in other case. |
||||
-- |
||||
checkJsonFile :: Maybe String -> String |
||||
checkJsonFile Nothing = "ERROR: Data file is not specified" |
||||
checkJsonFile (Just filename) = unsafeLocalState |
||||
$ handle (\ (e :: IOException) -> return $ show e) |
||||
$ openFile filename ReadMode >>= hClose >> return "ok" |
||||
|
||||
-- |
||||
-- | Service checking. |
||||
-- |
||||
-- If file is exists and readable return "ok". |
||||
-- |
||||
-- >>> curl json-search-server:port/health |
||||
-- { |
||||
-- "status":"ok", |
||||
-- "message":"" |
||||
-- } |
||||
-- |
||||
-- Otherwise return "warning" (if cache is used) or "fail" status with error message. |
||||
-- |
||||
-- >>> curl json-search-server:port/health |
||||
-- { |
||||
-- "status":"fail", |
||||
-- "message":"broken.json: openFile: does not exist (No such file or directory)" |
||||
-- } |
||||
-- |
||||
checkHealth :: UTCTime -> Bool -> Maybe String -> IO (ActionM ()) |
||||
checkHealth serverStartTime withCache filename = |
||||
let currentStatus = checkJsonFile filename in do |
||||
time <- getCurrentTime |
||||
|
||||
return $ json $ case currentStatus of |
||||
"ok" -> |
||||
if withCache |
||||
then Health "ok" ("cache was loaded " <> show serverStartTime) |
||||
else Health "ok" (show time) |
||||
_err -> |
||||
if withCache |
||||
then Health ("warning: using cache from " <> show serverStartTime) currentStatus |
||||
else Health "fail" (show time <> " " <> currentStatus) |
||||
|
||||
-- |
||||
-- | Read JSON data from file. |
||||
-- |
||||
-- NOINLINE pragma uses to force read file each function call. |
||||
-- |
||||
loadJSON :: Maybe String -> Maybe [Json] |
||||
loadJSON Nothing = Nothing |
||||
loadJSON (Just filename) = unsafeLocalState $ decodeFileStrict' filename |
||||
{-# NOINLINE loadJSON #-} |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Run a search with or without cache. |
||||
-- |
||||
-- If argument --cached was passed, it uses the stored data. |
||||
-- In another case, re-reads the data from the file. |
||||
-- |
||||
findWithCache :: Maybe [Json] -> Maybe String -> Text -> [(Json, Int, Int)] |
||||
findWithCache Nothing filename string = findJsonObjects (loadJSON filename) string |
||||
findWithCache cache _ string = findJsonObjects cache string |
||||
|
||||
-- |
||||
-- | Format or omit all Middleware output. |
||||
-- |
||||
formatLog :: String -> IO Middleware |
||||
formatLog "json" = mkRequestLogger def { outputFormat = CustomOutputFormatWithDetails formatAsJSON } |
||||
formatLog _none = mkRequestLogger def { outputFormat = CustomOutputFormat (\_ _ _ _ -> "") } |
||||
|
||||
-- |
||||
-- | Log types. |
||||
-- |
||||
logRequest :: String -> Application -> Request -> (Response -> IO ResponseReceived) -> IO ResponseReceived |
||||
logRequest format app req respond = |
||||
let ret f = f app req respond |
||||
header = replicate 120 |
||||
in |
||||
case format of |
||||
"full" -> do |
||||
pPrint $ header '~' |
||||
pPrint . show =<< getCurrentTime |
||||
pPrint $ header '-' |
||||
pPrint $ "Query: " |
||||
<> (decodeText . decodeUtf8 $ rawPathInfo req) |
||||
<> (decodeText . decodeUtf8 $ rawQueryString req) |
||||
pPrint $ header '-' |
||||
pPrint req |
||||
|
||||
ret =<< formatLog "none" |
||||
|
||||
"disable" -> ret =<< formatLog "none" |
||||
|
||||
"json" -> ret =<< formatLog "json" |
||||
|
||||
"simple" -> ret logStdoutDev |
||||
|
||||
_apache -> ret logStdout |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Start Scotty server with passing 'ServerSettings'. |
||||
-- |
||||
startServer :: ServerSettings -> IO () |
||||
startServer settings@(ServerSettings serverPort logFormat withCache filename) = |
||||
let cache | withCache = loadJSON filename |
||||
| otherwise = Nothing |
||||
in do |
||||
serverStartTime <- getCurrentTime |
||||
|
||||
scotty serverPort $ do |
||||
middleware $ logRequest logFormat |
||||
|
||||
get "/info" $ json settings |
||||
get "/health" $ join $ liftIO $ checkHealth serverStartTime withCache filename |
||||
get "/search" $ json . findWithCache cache filename =<< param "query" |
||||
get "/search/:query" $ json . findWithCache cache filename =<< param "query" |
||||
|
||||
notFound $ redirect "/info" |
||||
|
@ -0,0 +1,66 @@
@@ -0,0 +1,66 @@
|
||||
{-# LANGUAGE NoImplicitPrelude #-} |
||||
{-# LANGUAGE Safe #-} |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- Copyright (C) 2019, Maxim Lihachev, <envrm@yandex.ru> |
||||
|
||||
-- This program is free software: you can redistribute it and/or modify |
||||
-- it under the terms of the GNU General Public License as published by |
||||
-- the Free Software Foundation, either version 3 of the License, or |
||||
-- (at your option) any later version. |
||||
|
||||
-- This program is distributed in the hope that it will be useful, |
||||
-- but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
-- GNU General Public License for more details. |
||||
|
||||
-- You should have received a copy of the GNU General Public License |
||||
-- along with this program. If not, see <http://www.gnu.org/licenses/>. |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- |
||||
-- | Fuzzy search settings. |
||||
-- |
||||
module Settings ( maxContentLength |
||||
, minPatternLength |
||||
, scoreMultiplier |
||||
, outOfScore |
||||
, fullScore |
||||
, replaceChar ) |
||||
where |
||||
|
||||
import Relude ( Int, Char, (+), (*) ) |
||||
|
||||
-- ----------------------------------------------------------------------------- |
||||
|
||||
-- | Maximum length of strings in resulting json. |
||||
maxContentLength :: Int |
||||
maxContentLength = 160 |
||||
|
||||
-- | Minimum length of search pattern. |
||||
minPatternLength :: Int |
||||
minPatternLength = 4 |
||||
|
||||
-- | Maximum count of differencies between the search words and the string. |
||||
maxVariation :: Int |
||||
maxVariation = 2 |
||||
|
||||
-- | Awarded for insufficient string matching. |
||||
outOfScore :: Int |
||||
outOfScore = maxVariation + 1 |
||||
|
||||
-- | Awarded with the full concurrence with the string. |
||||
fullScore :: Int |
||||
fullScore = -10 * scoreMultiplier |
||||
|
||||
-- | The larger the number, the more accurate the sorting by relevance. |
||||
scoreMultiplier :: Int |
||||
scoreMultiplier = 10 |
||||
|
||||
-- | List of characters to replace during text preprocessing. |
||||
replaceChar :: Char -> Char |
||||
replaceChar 'ё' = 'е' |
||||
replaceChar ch = ch |
||||
|
@ -0,0 +1,64 @@
@@ -0,0 +1,64 @@
|
||||
# This file was automatically generated by 'stack init' |
||||
# |
||||
# Some commonly used options have been documented as comments in this file. |
||||
# For advanced use and comprehensive documentation of the format, please see: |
||||
# https://docs.haskellstack.org/en/stable/yaml_configuration/ |
||||
|
||||
# Resolver to choose a 'specific' stackage snapshot or a compiler version. |
||||
# A snapshot resolver dictates the compiler version and the set of packages |
||||
# to be used for project dependencies. For example: |
||||
# |
||||
# resolver: lts-3.5 |
||||
# resolver: nightly-2015-09-21 |
||||
# resolver: ghc-7.10.2 |
||||
# |
||||
# The location of a snapshot can be provided as a file or url. Stack assumes |
||||
# a snapshot provided as a file might change, whereas a url resource does not. |
||||
# |
||||
# resolver: ./custom-snapshot.yaml |
||||
# resolver: https://example.com/snapshots/2018-01-01.yaml |
||||
resolver: lts-13.23 |
||||
|
||||
# User packages to be built. |
||||
# Various formats can be used as shown in the example below. |
||||
# |
||||
# packages: |
||||
# - some-directory |
||||
# - https://example.com/foo/bar/baz-0.0.2.tar.gz |
||||
# - location: |
||||
# git: https://github.com/commercialhaskell/stack.git |
||||
# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a |
||||
# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a |
||||
# subdirs: |
||||
# - auto-update |
||||
# - wai |
||||
packages: |
||||
- . |
||||
# Dependency packages to be pulled from upstream that are not in the resolver |
||||
# using the same syntax as the packages field. |
||||
# (e.g., acme-missiles-0.3) |
||||
# extra-deps: [] |
||||
|
||||
# Override default flag values for local packages and extra-deps |
||||
# flags: {} |
||||
|
||||
# Extra package databases containing global packages |
||||
# extra-package-dbs: [] |
||||
|
||||
# Control whether we use the GHC we find on the path |
||||
# system-ghc: true |
||||
# |
||||
# Require a specific version of stack, using version ranges |
||||
# require-stack-version: -any # Default |
||||
# require-stack-version: ">=1.9" |
||||
# |
||||
# Override the architecture used by stack, especially useful on Windows |
||||
# arch: i386 |
||||
# arch: x86_64 |
||||
# |
||||
# Extra directories used by stack for building |
||||
# extra-include-dirs: [/path/to/dir] |
||||
# extra-lib-dirs: [/path/to/dir] |
||||
# |
||||
# Allow a newer minor version of GHC than the snapshot specifies |
||||
# compiler-check: newer-minor |
Loading…
Reference in new issue