commit 0b55f3cc4385d9ea05c7fbea780c7367b8f80593
Author: Maxim Likhachev
Date:   Wed Jun 19 20:00:41 2019 +0300

    Search server for json files

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f979be9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+.stack-work/
+json-search-server.cabal
+
+*~
+dist
+dist-*
+cabal-dev
+*.o
+*.hi
+*.chi
+*.chs.h
+*.dyn_o
+*.dyn_hi
+.hpc
+.hsenv
+.cabal-sandbox/
+cabal.sandbox.config
+*.prof
+*.aux
+*.hp
+*.eventlog
+cabal.project.local
+.HTF/
+.ghc.environment.*
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..01faa15
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,38 @@
+application = json-search-server
+stack       = stack
+hlint       = hlint
+
+all: app docs
+
+build: app
+
+deps:
+	$(stack) build --only-dependencies
+
+app:
+	$(stack) build
+
+install:
+	$(stack) install
+
+docs:
+	$(stack) exec -- haddock --html --pretty-html src/*.hs app/Main.hs --hyperlinked-source --optghc=-DHADDOCK --odir=docs --show-all --use-unicode
+
+lint: hlint
+hlint:
+	$(hlint) .; :
+
+check:
+	$(stack) build --force-dirty --ghc-options="-Weverything"
+
+static:
+	$(stack) build --force-dirty --ghc-options="-optl-static -optc-static"
+
+exec:
+	$(stack) exec -- $(application) $(filter-out $@, $(MAKECMDGOALS))
+
+%:
+	@true
+
+.PHONY: build app docs all hlint static exec check install
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..175895f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,326 @@
+# Json Search Server
+
+**v0.1.0**
+
+
+This application provides a fuzzy search server for data stored in JSON format.
+
+
+It was developed to be used with a website built with [Jekyll](https://jekyllrb.com/).
+
+---
+
+## License
+```
+Copyright (C) 2019, Maxim Lihachev,
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+```
+
+# Algorithm
+
+The logic of this application is fairly simple:
+* Every field of every record in the file is split into separate words.
+* Each unique word in the search string is compared against the words in the file.
+* An exact match is awarded the highest score.
+* For an inexact match, the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) is calculated and the word score is derived from it.
+* Results are sorted in descending order of relevance.
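As a rough illustration of this scoring scheme, here is a minimal Haskell sketch (the names `scoreWord`, `bestIn` and `rank`, the two-edit cut-off and the -100/-10 weights are illustrative only; the actual implementation lives in `src/Search.hs` and takes its weights from `src/Settings.hs`):

```haskell
import Data.List (sortOn)
import Data.Maybe (mapMaybe)
import Text.EditDistance (defaultEditCosts, levenshteinDistance)

-- Score one query word against one document word: an exact match gets the
-- best (most negative) score, a near match is penalised by its Levenshtein
-- distance, and anything more than two edits away is rejected.
scoreWord :: String -> String -> Maybe Int
scoreWord query word
  | query == word = Just (-100)
  | dist <= 2     = Just (negate (10 `div` dist))
  | otherwise     = Nothing
  where dist = levenshteinDistance defaultEditCosts query word

-- Rank documents (here just a name plus a bag of words): sum the best score
-- of every query word and sort ascending, so the most negative total
-- (the most relevant document) comes first.
rank :: [String] -> [(String, [String])] -> [(String, Int)]
rank queryWords docs =
  sortOn snd
    [ (name, total)
    | (name, ws) <- docs
    , let total = sum (mapMaybe (`bestIn` ws) queryWords)
    , total < 0
    ]
  where bestIn q ws = case mapMaybe (scoreWord q) ws of
                        [] -> Nothing
                        ss -> Just (minimum ss)
```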
+
+
+# How to use
+
+## Build
+
+```
+$ make build
+or
+$ make static
+```
+
+A [Vagrant](https://www.vagrantup.com/) file is provided that starts a CentOS 7 VM; it is used to build a statically linked binary for the target server.
+
+```
+host:~$ vagrant up
+host:~$ vagrant ssh
+
+vm:~$ cd /vagrant
+vm:~$ make static
+```
+
+The static executable is placed in the `bin/` directory.
+
+## Run
+```
+$ make exec
+
+json-search-server v0.1.0: search server for Json
+
+json-search-server [OPTIONS]
+
+Common flags:
+  -p --port=3000         Search server port
+  -l --logs=apache       apache | simple | json | disable | full
+  -c --cached            Store Json data into memory
+  -j -f --json=data.json --file  Json file name
+  -? --help              Display help message
+  -V --version           Print version information
+     --numeric-version   Print just the version number
```
+
+## Install
+
+```
+$ make install
+```
+
+
+# Requests
+
+## Get Server Settings
+
+```bash
+$ curl -sq http://localhost:3000/info
+
+{
+    "cached": false,
+    "logs": "full",
+    "file": "sample.json",
+    "port": 3000
+}
+```
+
+## Health Check
+
+```bash
+$ curl -sq http://localhost:3000/health
+
+{
+    "status": "ok",
+    "message": "2019-06-19 09:03:26.402385 UTC"
+}
+```
+
+```bash
+$ chmod a-r sample.json
+$ curl -sq http://localhost:3000/health
+
+{
+    "status": "fail",
+    "message": "2019-06-19 09:05:32.950256 UTC sample.json: openFile: permission denied (Permission denied)"
+}
+```
+
+## Search
+
+```bash
+$ curl -sq http://localhost:3000/search/article
+$ curl -sq http://localhost:3000/search?query=article
+
+[
+    [
+        {
+            "url": "/tags/foo.html",
+            "authors": "Author I, Author II",
+            "content": "This is article about Foo and Bar.",
+            "year": "1990",
+            "title": "Page one"
+        },
+        1,
+        -100
+    ]
+]
+```
+
+Each result is a triple: the matched object, the number of fields that matched, and the relevance score (the more negative the score, the better the match).
+
+## Logs
+
+There are a few formats for log messages. Logging can be disabled completely by passing `--logs disable`.
+
+
+### --logs apache (default)
+
+```bash
+127.0.0.1 - - [19/Jun/2019:12:14:16 +0300] "GET /info HTTP/1.1" 200 - "" "curl/7.54.0"
+127.0.0.1 - - [19/Jun/2019:12:14:18 +0300] "GET /health HTTP/1.1" 200 - "" "curl/7.54.0"
+```
+
+### --logs simple
+
+```bash
+GET /health
+  Accept: */*
+  Status: 200 OK 0.00003s
+GET /info
+  Accept: */*
+  Status: 200 OK 0.000018s
+```
+
+### --logs json
+
+```json
+{"time":"19/Jun/2019:12:20:06 +0300","response":{"status":200,"size":null,"body":null},"request":{"httpVersion":"1.1","path":"/health","size":0,"body":"","durationMs":7.0e-2,"remoteHost":{"hostAddress":"127.0.0.1","port":63436},"headers":[["Host","localhost:3000"],["User-Agent","curl/7.54.0"],["Accept","*/*"]],"queryString":[],"method":"GET"}}
+{"time":"19/Jun/2019:12:20:07 +0300","response":{"status":200,"size":null,"body":null},"request":{"httpVersion":"1.1","path":"/info","size":0,"body":"","durationMs":4.0e-2,"remoteHost":{"hostAddress":"127.0.0.1","port":63438},"headers":[["Host","localhost:3000"],["User-Agent","curl/7.54.0"],["Accept","*/*"]],"queryString":[],"method":"GET"}}
+```
+
+Or pretty-printed:
+
+```json
+{
+    "time": "19/Jun/2019:12:20:06 +0300",
+    "response": {
+        "status": 200,
+        "size": null,
+        "body": null
+    },
+    "request": {
+        "httpVersion": "1.1",
+        "path": "/health",
+        "size": 0,
+        "body": "",
+        "durationMs": 0.07,
+        "remoteHost": {
+            "hostAddress": "127.0.0.1",
+            "port": 63436
+        },
+        "headers": [
+            [
+                "Host",
+                "localhost:3000"
+            ],
+            [
+                "User-Agent",
+                "curl/7.54.0"
+            ],
+            [
+                "Accept",
+                "*/*"
+            ]
+        ],
+        "queryString": [],
+        "method": "GET"
+    }
+}
+```
+
+
+### --logs full
+
+```bash
+"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+"2019-06-19 09:05:22.226024 UTC"
+"------------------------------------------------------------------------------------------------------------------------"
+"Query: /health"
+"------------------------------------------------------------------------------------------------------------------------"
+Request
+    { requestMethod = "GET"
+    , httpVersion = HTTP/1.1
+    , rawPathInfo = "/health"
+    , rawQueryString = ""
+    , requestHeaders =
+        [
+            ( "Host"
+            , "localhost:3000"
+            )
+        ,
+            ( "User-Agent"
+            , "curl/7.54.0"
+            )
+        ,
+            ( "Accept"
+            , "*/*"
+            )
+        ]
+    , isSecure = False
+    , remoteHost = 127.0.0.1:63212
+    , pathInfo = [ "health" ]
+    , queryString = []
+    , requestBody =
+    , vault =
+    , requestBodyLength = KnownLength 0
+    , requestHeaderHost = Just "localhost:3000"
+    , requestHeaderRange = Nothing
+    }
+```
+
+## Caching
+
+It is possible to keep all data in RAM and keep serving it even when the file is no longer readable.
+To do this, pass the `--cached` argument.
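For example (a hypothetical session; the timestamp is made up and the response shape follows the `/health` handler in `src/Server.hs`), with `--cached` the data stays available from memory and the health check degrades to a warning instead of a failure:

```bash
$ json-search-server --json sample.json --cached &
$ chmod a-r sample.json
$ curl -sq http://localhost:3000/health

{
    "status": "warning: using cache from 2019-06-19 09:00:00 UTC",
    "message": "sample.json: openFile: permission denied (Permission denied)"
}
```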
+
+
+# NGINX
+
+To use this service behind the nginx web server, the following configuration might be used:
+
+```nginx
+upstream search_backend {
+    server 127.0.0.1:3000;
+}
+
+server {
+    listen 8080;
+    server_name search.server www.search.server;
+
+    # ...
+
+    location / {
+        add_header 'Access-Control-Allow-Origin' "$http_origin";
+        add_header 'Access-Control-Allow-Methods' 'GET, POST';
+        add_header 'Access-Control-Allow-Credentials' 'true';
+        add_header 'Access-Control-Allow-Headers' 'User-Agent,Keep-Alive,Content-Type';
+
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Host $remote_addr;
+        proxy_pass http://search_backend$uri?$args;
+    }
+
+    # This prevents intruders from obtaining information
+    # about the internal structure of the server.
+    location /info {
+        proxy_pass http://search_backend/health;
+    }
+}
+```
+
+# Jekyll
+
+Creating the Json file with Jekyll looks like this:
+
+```liquid
+---
+layout: none
+search: none
+---
+
+{% assign all_pages = site.pages | where_exp: 'p', 'p.search != "none"' | where_exp: 'p', 'p.layout != "none"' %}
+
+[
+
+  {% for p in all_pages %}
+  {% capture all_authors %}{{ p.authors | join: ',' }}, {{ p.translators | join: ',' }}, {{ p.editors | join: ',' }}{% endcapture %}
+  {
+    "title": "{{ p.title | split: '
+' | join: ' ' | xml_escape }}{% if p.tag %} «{{ p.tag }}» {% endif %}",
+    "authors": "{{ p.authors | join: ',' }}",
+    "persons": "{{ all_authors }}",
+    "content": {{ p.content | strip_html | jsonify }},
+    "tags": "{{ p.tags | join: ', ' }}",
+    "year": "{{ p.year }}",
+    "url": "{{ p.url | xml_escape }}"
+  }
+  {% unless forloop.last %},{% endunless %}
+  {% endfor %}
+
+]
+```
diff --git a/Setup.hs b/Setup.hs
new file mode 100644
index 0000000..9a994af
--- /dev/null
+++ b/Setup.hs
@@ -0,0 +1,2 @@
+import Distribution.Simple
+main = defaultMain
diff --git a/app/Main b/app/Main
new file mode 100755
index 0000000..da0c2cd
Binary files /dev/null and b/app/Main differ
diff --git a/app/Main.hs b/app/Main.hs
new file mode 100644
index 0000000..3009ce4
--- /dev/null
+++ b/app/Main.hs
@@ -0,0 +1,70 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE Unsafe #-}
+
+-- -----------------------------------------------------------------------------
+
+-- Copyright (C) 2019, Maxim Lihachev,
+
+-- This program is free software: you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation, either version 3 of the License, or
+-- (at your option) any later version.
+
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Json-search service
+--
+module Main (main) where
+
+import Relude ( Maybe (Nothing), Bool (False), IO, ($), id, null )
+
+import System.Exit            ( die )
+import System.Environment     ( getArgs, withArgs )
+import System.Console.CmdArgs ( cmdArgs, help, summary, typ, name, program, (&=) )
+
+import Text.Pretty.Simple ( pPrint )
+
+import Server ( startServer, ServerSettings (ServerSettings, file, cached, logs, port), checkJsonFile )
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Default command line arguments
+--
+defaultSettings :: IO ServerSettings
+defaultSettings = cmdArgs
+        $ ServerSettings { port   = 3000     &= typ "3000"      &= help "Search server port"
+                         , logs   = "apache" &= typ "apache"    &= help "apache | simple | json | disable | full"
+                         , cached = False    &= typ "False"     &= help "Store Json data into memory" &= name "cached"
+                         , file   = Nothing  &= typ "data.json" &= help "Json file name" &= name "json"
+                         } &= program "json-search-server"
+                           &= summary "json-search-server v0.1.0: search server for Json"
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Start application
+--
+main :: IO ()
+main = do
+        args <- getArgs
+        settings <- (if null args then withArgs ["--help"] else id) defaultSettings
+
+        pPrint settings
+
+        let fileStatus = checkJsonFile (file settings) in
+            case fileStatus of
+              "ok" ->
+                  startServer settings
+              _err ->
+                  die fileStatus
+
diff --git a/bin/json-search.x86_64.centos7 b/bin/json-search.x86_64.centos7
new file mode 100755
index 0000000..265e8fb
Binary files /dev/null and b/bin/json-search.x86_64.centos7 differ
diff --git a/package.yaml b/package.yaml
new file mode 100644
index 0000000..012d30e
--- /dev/null
+++ b/package.yaml
@@ -0,0 +1,70 @@
+name: json-search-server
+version: 0.1.0.0
+# github: ""
+license: GPL-3.0-only
+author: "Maxim Likhachev"
+maintainer: "envrm@yandex.ru"
+copyright: "2019 Maxim Likhachev"
+
+library:
+  source-dirs: src
+
+executables:
+  json-search-server:
+    main: Main.hs
+    source-dirs: app
+    dependencies: json-search-server
+
+extra-source-files:
+- README.md
+
+dependencies:
+- base
+- relude ^>=0.4.0
+- cmdargs ^>=0.10.20
+- aeson ^>=1.4.0.0
+- edit-distance ^>=0.2.2.1
+- text ^>=1.2.3.1
+- time ^>=1.8.0.2
+- uri-encode ^>=1.5.0.5
+- scotty ^>=0.11.4
+- wai ^>=3.2.2
+- wai-extra ^>=3.0.26
+- pretty-simple ^>=2.2.0.1
+- MissingH ^>=1.4.1.0
+- data-default-class ^>=0.1.2.0
+
+ghc-options:
+  - -fforce-recomp
+  - -fobject-code
+  - -O2
+  - -optc-O3
+  - -funfolding-use-threshold=16
+  - -threaded
+
+  - -Wincomplete-uni-patterns
+  - -Wincomplete-record-updates
+  - -Wmonomorphism-restriction
+  - -Wimplicit-prelude
+  - -Wmissing-local-signatures
+  - -Wmissing-exported-signatures
+  - -Wmissing-export-lists
+  - -Wmissing-import-lists
+  - -Wmissing-home-modules
+  - -Wmissing-methods
+  - -Wunused-packages
+  - -Wdodgy-imports
+  - -Widentities
+  - -Wredundant-constraints
+  - -Wpartial-fields
+  - -Wmissed-specialisations
+  - -Wcpp-undef
+  - -Wunused-imports
+  - -Wunused-matches
+
+  - -fno-cse
+  - -fhide-source-paths
+  - -freverse-errors
+
+  # - -Weverything
+
diff --git a/sample.json b/sample.json
new file mode 100644
index 0000000..254e365
--- /dev/null
+++ b/sample.json
@@ -0,0 +1,29 @@
+[
+  {
+    "title": "Page one",
+    "authors": "Author I, Author II",
+    "persons": "Editor, Translator",
+    "content": "This is article about Foo and Bar.",
+    "tags": "foo, bar",
+    "year": "1990",
+    "url": "/pages/foo.html"
+  },
+  {
+    "title": "Page two",
+    "authors": "Author III",
+    "persons": "",
+    "content": "Sample page.",
+    "tags": "bar, baz",
+    "year": "2000",
+    "url": "/pages/bar.html"
+  },
+  {
+    "title": "Page three",
+    "authors": "Author IV",
+    "persons": "Translator",
+    "content": "",
+    "tags": "baz",
+    "year": "2010",
+    "url": "/pages/baz.html"
+  }
+]
diff --git a/src/Search.hs b/src/Search.hs
new file mode 100644
index 0000000..5b4b778
--- /dev/null
+++ b/src/Search.hs
@@ -0,0 +1,243 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PatternSynonyms #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE TransformListComp #-}
+{-# LANGUAGE Unsafe #-}
+
+-- -----------------------------------------------------------------------------
+
+-- Copyright (C) 2019, Maxim Lihachev,
+
+-- This program is free software: you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation, either version 3 of the License, or
+-- (at your option) any later version.
+
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Fuzzy search for JSON data
+--
+module Search (Json, tokenize, findJsonObjects) where
+
+-- -----------------------------------------------------------------------------
+
+import Relude ( Maybe (Just, Nothing), Bool, Int, String, Text
+              , otherwise, sortWith, toString, toText, div
+              , (-), (*), ($), (&), (<$>), (==), (<>), (<), (>), (>=), (&&), (.) )
+
+import Data.Aeson        ( FromJSON, ToJSON, toJSON, object, (.=) )
+import Data.Char         ( toLower, isAlphaNum )
+import Data.List         ( minimum, map, sum, elem, filter, length, nub )
+import GHC.Generics      ( Generic )
+import Text.EditDistance ( levenshteinDistance, defaultEditCosts )
+
+import qualified Data.Text          as T   ( map, length, take, toLower, strip, words )
+import qualified Network.URI.Encode as URI ( decodeText )
+
+import Settings ( maxContentLength, minPatternLength
+                , outOfScore, fullScore, scoreMultiplier, replaceChar )
+
+-- -----------------------------------------------------------------------------
+
+pattern T :: String -> Text
+pattern T str <- (toString -> str)
+        where T = toText
+
+data FindStrictness = Strict -- ^ Exact match
+                    | Fuzzy  -- ^ Fuzzy search
+
+-- | The 'Score' structure
+data Score = Score { matches :: !Int -- ^ Count of matches
+                   , diffs   :: !Int -- ^ Count of differences (errors) or 'fullScore'
+                   }
+
+--
+-- | Json format:
+--
+-- [
+--     {
+--         "title": "Page one",
+--         "authors": "Author I, Author II",
+--         "persons": "Editor, Translator",
+--         "content": "This is article about Foo and Bar.",
+--         "tags": "foo, bar",
+--         "year": "1990",
+--         "url": "/pages/foo.html"
+--     },
+--     {...}
+-- ]
+--
+data Json = Json { title   :: !Text
+                 , authors :: !Text
+                 , persons :: !Text
+                 , content :: !Text
+                 , tags    :: !Text
+                 , year    :: !Text
+                 , url     :: !Text } deriving (Generic)
+
+instance FromJSON Json
+
+--
+-- | Json output preprocessing
+--
+-- * URLs are decoded.
+-- * "content" is shortened.
+--
+instance ToJSON Json where
+        toJSON jsonObject =
+            object [ "title"   .= title jsonObject
+                   , "authors" .= authors jsonObject
+                   , "year"    .= year jsonObject
+                   , "url"     .= (url jsonObject     & URI.decodeText)
+                   , "content" .= (content jsonObject & shortenWith "..." maxContentLength) ]
+                   -- "tags" .= (tags jsonObject & shortenWith "..." maxContentLength) ]
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Shorten text to the specified length and append the given suffix.
+--
+-- >>> shortenWith "" 8 "1234567890"
+-- "12345678"
+--
+-- >>> shortenWith "..." 8 "1234567890"
+-- "12345..."
+--
+shortenWith :: Text -> Int -> Text -> Text
+shortenWith suffix n string | T.length string >= n = T.strip (T.take (n - T.length suffix) string) <> suffix
+                            | otherwise            = string
+
+--
+-- | A 'Score' is valid when at least one word matched and the total score is negative.
+--
+valid :: Score -> Bool
+valid (Score x y) = x > 0 && y < 0
+
+-- | Sum a numeric projection over a list.
+sumall :: (a -> Int) -> [a] -> Int
+sumall fn = sum . map fn
+
+--
+-- | Calculation of relevance:
+--
+-- * Each complete match with the pattern (diffs == 0) is awarded
+--   'fullScore' points (-100 by default); the remaining scores are added together.
+--
+-- * The result is negative so that sorting in ascending order yields
+--   results in decreasing order of accuracy.
+--
+-- >>> getScore [Score { matches = 1, diffs = 0 }, Score { matches = 2, diffs = 1 }]
+-- -120
+--
+getScore :: [Score] -> Int
+getScore = sumall scores
+        where scores (Score x 0) = fullScore * x
+              scores (Score x s) = -x * scoreMultiplier `div` s
+
+--
+-- | Splitting text into separate words.
+--
+-- * Keeping only alphanumeric characters.
+-- * Deleting words shorter than 'minPatternLength'.
+--
+-- >>> tokenize "an alpha, a beta: a gam-ma; and a (delta)"
+-- ["alpha","beta","delta"]
+--
+tokenize :: Text -> [Text]
+tokenize xs = filter (\w -> T.length w >= minPatternLength) wds
+        where wds = T.words $ T.map isChar xs
+              isChar ch | isAlphaNum ch = replaceChar . toLower $ ch
+                        | otherwise     = ' '
+
+-- | [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) computation
+levenshtein :: Text -> Text -> Int
+levenshtein str txt = levenshteinDistance defaultEditCosts (toString str) (toString txt)
+
+--
+-- | Compare a word with all the words of a text
+-- and return the 'Score'.
+--
+findWord :: [Text] -> Text -> Score
+findWord txt word | word `elem` txt = Score { matches = 1,   diffs = 0 }
+                  | otherwise       = Score { matches = lns, diffs = ms }
+        where ls  = [ l | wd <- txt, let l = levenshtein word wd, l < outOfScore ]
+              lns = length ls
+              ms | lns > 0   = minimum ls
+                 | otherwise = outOfScore
+
+--
+-- | Evaluation of the similarity of two strings.
+--
+-- >>> locate "string" "string" Strict
+-- Score 1 fullScore    -- exact match without any errors
+--
+-- >>> locate "string" "strings" Fuzzy
+-- Score 1 -10          -- match with one difference
+--
+-- >>> locate "string" "text" Fuzzy
+-- Score 0 outOfScore   -- string not found
+--
+locate :: Text -> Text -> FindStrictness -> Score
+-- Empty string or empty text
+locate (T[]) _ _ = Score { matches = 0, diffs = outOfScore }
+locate _ (T[]) _ = Score { matches = 0, diffs = outOfScore }
+-- Strict search
+locate str txt Strict | T.toLower str == T.toLower txt = Score { matches = 1, diffs = fullScore }
+                      | otherwise                      = Score { matches = 0, diffs = outOfScore }
+-- Fuzzy search
+locate str txt Fuzzy = Score { matches = sumall matches trs, diffs = getScore trs }
+        where trs = findWord wds <$> sxs
+              wds = tokenize txt
+              sxs = nub $ tokenize str
+
+--
+-- | Infix alias for 'locate'.
+--
+-- >>> ("string" ==~ "Text with string") Fuzzy
+-- Score 1 fullScore
+--
+(==~) :: Text -> Text -> FindStrictness -> Score
+(==~) = locate
+
+--
+-- | Compare all fields of a record with a string.
+--
+anyMatch :: Text -> Json -> Score
+anyMatch string jsonObject = Score { matches = length l, diffs = sumall diffs l }
+        where l = filter valid [ string ==~ fullname           $ Fuzzy,
+                                 string ==~ content jsonObject $ Fuzzy,
+                                 string ==~ year jsonObject    $ Strict,
+                                 string ==~ tags jsonObject    $ Fuzzy
+                               ]
+              fullname = persons jsonObject <> " " <> title jsonObject
+
+--
+-- | Alias for 'anyMatch'.
+--
+-- >>> "string" `within` jsonObject
+--
+within :: Text -> Json -> Score
+within = anyMatch
+
+--
+-- | Search for Json objects which include the given string.
+--
+findJsonObjects :: Maybe [Json] -> Text -> [(Json, Int, Int)]
+findJsonObjects Nothing     _      = []
+findJsonObjects (Just objs) string | T.length string < minPatternLength = []
+                                   | otherwise                          = foundPages
+        where foundPages = [ (obj, mcs, dfs) | obj <- objs,
+                             let item = string `within` obj, valid item,
+                             let mcs = matches item,
+                             let dfs = diffs item,
+                             then sortWith by dfs ]
+
diff --git a/src/Server.hs b/src/Server.hs
new file mode 100644
index 0000000..921b18f
--- /dev/null
+++ b/src/Server.hs
@@ -0,0 +1,201 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE Unsafe #-}
+
+-- -----------------------------------------------------------------------------
+
+-- Copyright (C) 2019, Maxim Lihachev,
+
+-- This program is free software: you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation, either version 3 of the License, or
+-- (at your option) any later version.
+
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Scotty-based web service which provides an API for the "Search" module.
+--
+module Server (startServer, ServerSettings (..), checkHealth, checkJsonFile) where
+
+
+import Relude ( Maybe (Just, Nothing), String, Text, Int, Bool, IO, Show
+              , ($), (.), (>>), (=<<), (>>=), (<>)
+              , otherwise, liftIO, show, return, replicate )
+
+import Data.Aeson  ( ToJSON, decodeFileStrict' )
+import Web.Scotty  ( ActionM, scotty, get, json, notFound, param, middleware, redirect )
+import Network.Wai ( Middleware, Application, Request, Response, ResponseReceived, rawQueryString, rawPathInfo )
+import Network.Wai.Middleware.RequestLogger.JSON ( formatAsJSON )
+import Network.Wai.Middleware.RequestLogger ( logStdout, logStdoutDev, mkRequestLogger, outputFormat
+                                            , OutputFormat (CustomOutputFormat, CustomOutputFormatWithDetails) )
+
+import Text.Pretty.Simple     ( pPrint )
+import Foreign.Marshal.Unsafe ( unsafeLocalState )
+import GHC.Generics           ( Generic )
+import Data.Default.Class     ( Default (def) )
+
+import System.Console.CmdArgs ( Data )
+import System.IO              ( openFile, hClose, IOMode (ReadMode) )
+import Control.Exception      ( handle, IOException )
+import Control.Monad          ( join )
+import Data.Time              ( getCurrentTime, UTCTime )
+import Data.Text.Encoding     ( decodeUtf8 )
+import Network.URI.Encode     ( decodeText )
+
+import Search ( findJsonObjects, Json )
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Server settings format.
+--
+data ServerSettings = ServerSettings { port   :: Int
+                                     , logs   :: String
+                                     , cached :: Bool
+                                     , file   :: Maybe String
+                                     } deriving (Show, Data, Generic, ToJSON)
+
+--
+-- | Healthcheck format.
+--
+data Health = Health { status  :: !String
+                     , message :: !String } deriving (Show, Generic, ToJSON)
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Returns "ok" if 'filename' is readable, or an error message otherwise.
+--
+checkJsonFile :: Maybe String -> String
+checkJsonFile Nothing         = "ERROR: Data file is not specified"
+checkJsonFile (Just filename) = unsafeLocalState
+        $ handle (\ (e :: IOException) -> return $ show e)
+        $ openFile filename ReadMode >>= hClose >> return "ok"
+
+--
+-- | Service health check.
+--
+-- If the file exists and is readable, return "ok".
+--
+-- >>> curl json-search-server:port/health
+-- {
+--     "status":"ok",
+--     "message":""
+-- }
+--
+-- Otherwise return a "warning" (if the cache is used) or a "fail" status with an error message.
+--
+-- >>> curl json-search-server:port/health
+-- {
+--     "status":"fail",
+--     "message":"broken.json: openFile: does not exist (No such file or directory)"
+-- }
+--
+checkHealth :: UTCTime -> Bool -> Maybe String -> IO (ActionM ())
+checkHealth serverStartTime withCache filename =
+        let currentStatus = checkJsonFile filename in do
+            time <- getCurrentTime
+
+            return $ json $ case currentStatus of
+              "ok" ->
+                  if withCache
+                     then Health "ok" ("cache was loaded " <> show serverStartTime)
+                     else Health "ok" (show time)
+              _err ->
+                  if withCache
+                     then Health ("warning: using cache from " <> show serverStartTime) currentStatus
+                     else Health "fail" (show time <> " " <> currentStatus)
+
+--
+-- | Read JSON data from a file.
+--
+-- The NOINLINE pragma is used to force the file to be re-read on each call.
+--
+loadJSON :: Maybe String -> Maybe [Json]
+loadJSON Nothing         = Nothing
+loadJSON (Just filename) = unsafeLocalState $ decodeFileStrict' filename
+{-# NOINLINE loadJSON #-}
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Run a search with or without the cache.
+--
+-- If the --cached argument was passed, the stored data is used;
+-- otherwise the data is re-read from the file.
+--
+findWithCache :: Maybe [Json] -> Maybe String -> Text -> [(Json, Int, Int)]
+findWithCache Nothing filename string = findJsonObjects (loadJSON filename) string
+findWithCache cache   _        string = findJsonObjects cache string
+
+--
+-- | Format or omit all Middleware output.
+--
+formatLog :: String -> IO Middleware
+formatLog "json" = mkRequestLogger def { outputFormat = CustomOutputFormatWithDetails formatAsJSON }
+formatLog _none  = mkRequestLogger def { outputFormat = CustomOutputFormat (\_ _ _ _ -> "") }
+
+--
+-- | Log types.
+--
+logRequest :: String -> Application -> Request -> (Response -> IO ResponseReceived) -> IO ResponseReceived
+logRequest format app req respond =
+        let ret f  = f app req respond
+            header = replicate 120
+        in
+        case format of
+          "full" -> do
+              pPrint $ header '~'
+              pPrint . show =<< getCurrentTime
+              pPrint $ header '-'
+              pPrint $ "Query: "
+                     <> (decodeText . decodeUtf8 $ rawPathInfo req)
+                     <> (decodeText . decodeUtf8 $ rawQueryString req)
+              pPrint $ header '-'
+              pPrint req
+
+              ret =<< formatLog "none"
+
+          "disable" -> ret =<< formatLog "none"
+
+          "json"    -> ret =<< formatLog "json"
+
+          "simple"  -> ret logStdoutDev
+
+          _apache   -> ret logStdout
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Start the Scotty server with the given 'ServerSettings'.
+--
+startServer :: ServerSettings -> IO ()
+startServer settings@(ServerSettings serverPort logFormat withCache filename) =
+        let cache | withCache = loadJSON filename
+                  | otherwise = Nothing
+        in do
+            serverStartTime <- getCurrentTime
+
+            scotty serverPort $ do
+                middleware $ logRequest logFormat
+
+                get "/info"          $ json settings
+                get "/health"        $ join $ liftIO $ checkHealth serverStartTime withCache filename
+                get "/search"        $ json . findWithCache cache filename =<< param "query"
+                get "/search/:query" $ json . findWithCache cache filename =<< param "query"
+
+                notFound $ redirect "/info"
+
diff --git a/src/Settings.hs b/src/Settings.hs
new file mode 100644
index 0000000..3e6fd53
--- /dev/null
+++ b/src/Settings.hs
@@ -0,0 +1,66 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE Safe #-}
+
+-- -----------------------------------------------------------------------------
+
+-- Copyright (C) 2019, Maxim Lihachev,
+
+-- This program is free software: you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation, either version 3 of the License, or
+-- (at your option) any later version.
+
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+-- -----------------------------------------------------------------------------
+
+--
+-- | Fuzzy search settings.
+--
+module Settings ( maxContentLength
+                , minPatternLength
+                , scoreMultiplier
+                , outOfScore
+                , fullScore
+                , replaceChar )
+where
+
+import Relude ( Int, Char, (+), (*) )
+
+-- -----------------------------------------------------------------------------
+
+-- | Maximum length of strings in the resulting json.
+maxContentLength :: Int
+maxContentLength = 160
+
+-- | Minimum length of a search pattern.
+minPatternLength :: Int
+minPatternLength = 4
+
+-- | Maximum number of differences between a search word and a string.
+maxVariation :: Int
+maxVariation = 2
+
+-- | Awarded for insufficient string matching.
+outOfScore :: Int
+outOfScore = maxVariation + 1
+
+-- | Awarded for full concurrence with the string.
+fullScore :: Int
+fullScore = -10 * scoreMultiplier
+
+-- | The larger the number, the more accurate the sorting by relevance.
+scoreMultiplier :: Int
+scoreMultiplier = 10
+
+-- | Characters to replace during text preprocessing.
+replaceChar :: Char -> Char
+replaceChar 'ё' = 'е'
+replaceChar ch  = ch
+
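As a rough worked example of how these constants combine (the numbers follow from the definitions above; `getScore` and `Score` refer to `src/Search.hs`, with `Score` constructed positionally for brevity):

```haskell
-- With the defaults above:
--   maxVariation    = 2      -- up to two edits still count as a fuzzy match
--   outOfScore      = 3      -- an edit distance of 3 or more is rejected
--   scoreMultiplier = 10
--   fullScore       = -100   -- (-10) * scoreMultiplier
--
-- so a single word contributes roughly:
--
-- >>> getScore [Score 1 0]   -- exact match
-- -100
-- >>> getScore [Score 1 1]   -- one edit away
-- -10
-- >>> getScore [Score 1 2]   -- two edits away
-- -5
```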
diff --git a/stack.yaml b/stack.yaml
new file mode 100644
index 0000000..dabccd3
--- /dev/null
+++ b/stack.yaml
@@ -0,0 +1,64 @@
+# This file was automatically generated by 'stack init'
+#
+# Some commonly used options have been documented as comments in this file.
+# For advanced use and comprehensive documentation of the format, please see:
+# https://docs.haskellstack.org/en/stable/yaml_configuration/
+
+# Resolver to choose a 'specific' stackage snapshot or a compiler version.
+# A snapshot resolver dictates the compiler version and the set of packages
+# to be used for project dependencies. For example:
+#
+# resolver: lts-3.5
+# resolver: nightly-2015-09-21
+# resolver: ghc-7.10.2
+#
+# The location of a snapshot can be provided as a file or url. Stack assumes
+# a snapshot provided as a file might change, whereas a url resource does not.
+#
+# resolver: ./custom-snapshot.yaml
+# resolver: https://example.com/snapshots/2018-01-01.yaml
+resolver: lts-13.23
+
+# User packages to be built.
+# Various formats can be used as shown in the example below.
+#
+# packages:
+# - some-directory
+# - https://example.com/foo/bar/baz-0.0.2.tar.gz
+# - location:
+#    git: https://github.com/commercialhaskell/stack.git
+#    commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
+# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
+#   subdirs:
+#   - auto-update
+#   - wai
+packages:
+- .
+# Dependency packages to be pulled from upstream that are not in the resolver
+# using the same syntax as the packages field.
+# (e.g., acme-missiles-0.3)
+# extra-deps: []
+
+# Override default flag values for local packages and extra-deps
+# flags: {}
+
+# Extra package databases containing global packages
+# extra-package-dbs: []
+
+# Control whether we use the GHC we find on the path
+# system-ghc: true
+#
+# Require a specific version of stack, using version ranges
+# require-stack-version: -any # Default
+# require-stack-version: ">=1.9"
+#
+# Override the architecture used by stack, especially useful on Windows
+# arch: i386
+# arch: x86_64
+#
+# Extra directories used by stack for building
+# extra-include-dirs: [/path/to/dir]
+# extra-lib-dirs: [/path/to/dir]
+#
+# Allow a newer minor version of GHC than the snapshot specifies
+# compiler-check: newer-minor