From 5ced26f343a43cb485c0f9cf76e64a5a76466fa7 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Wed, 12 Apr 2017 17:15:12 +0200 Subject: added a web search article (still unpolished und unpublished) --- config.toml | 2 +- content/blog/web-search-homepage.md | 259 +++++++++++++++++++++++ static/images/blog/web-search-homepage/strus.jpg | Bin 0 -> 36406 bytes 3 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 content/blog/web-search-homepage.md create mode 100644 static/images/blog/web-search-homepage/strus.jpg diff --git a/config.toml b/config.toml index 736bfda..14fa0e3 100644 --- a/config.toml +++ b/config.toml @@ -14,7 +14,7 @@ SectionPagesMenu = "main" location = "Zurich" copyright_years = "2009 - 2017" opengraph = true - strus_url = "http://euromac.local:8080/strus/query/wwwandreasbaumanncc" + strus_url = "http://eeepc.home:8080/strus/query/wwwandreasbaumanncc" [Params.widgets] search = true diff --git a/content/blog/web-search-homepage.md b/content/blog/web-search-homepage.md new file mode 100644 index 0000000..e7083bb --- /dev/null +++ b/content/blog/web-search-homepage.md @@ -0,0 +1,259 @@ ++++ +draft = true +title = "Web search for my homepage" +date = "2017-04-12T15:49:11+01:00" +categories = [ "Strus", "Search", "Information Retrieval" ] +thumbnail = "/images/blog/web-search-homepage/strus.jpg" ++++ + +## Intro +I wanted to add a search function to my web page. +As the website is build with Hugo as a set of static +HTML pages onto a read-only web server, standard +approaches didn't work like a LIKE-query in Mysql +as many CMS are implementing search. + +The big logo gives it away, it's done with the +strusWebService from the [Strus](http://project-strus.net/) +project. + +The basic idea is that the author of the web pages +can build a search index locally with the markdown version +of his content and then push it to a webservice dedicated +to search only. 
Again, the files making up the search index
+can be set to read-only after an update, leaving the system
+open to only DOSA or DDOSA (but which public system isn't).
+
+## Installing strus for content indexing
+
+So, I installed the package 'strusutilities' for ArchLinux
+on my local machine from the
+[OpenBuildService](https://software.opensuse.org/download.html?project=home:andreas_baumann&package=strusutilities)
+with:
+
+```
+pacman -S strusutilities
+```
+
+This gives me the command line tools to build a search index.
+
+## Generating the documents readable by strus
+
+The command line tools consist of tools to analyze the document,
+apply some basic parsing and normalization of search terms.
+
+The tools take XML, JSON or TSV (tab-separated-values) currently.
+My Hugo documents have their metadata in TOML and the content in
+Markdown:
+
+```
++++
+title = "Web search for my homepage"
+date = "2017-04-12T15:49:11+01:00"
+categories = [ "Strus", "Search", "Information Retrieval" ]
+thumbnail = "/images/blog/web-search-homepage/strus.jpg"
++++
+
+I wanted to add a search function to my web page.
+As the website is built with Hugo as a set of static
+...
+```
+
+So I first have to convert the blog entries to a big XML
+file using:
+
+ * [remarshal](https://github.com/dbohdan/remarshal): converts
+   the metadata in TOML/YAML to JSON
+ * [pandoc](http://pandoc.org/): convert markdown to
+   tons of formats
+
+I chose to convert to a DocBook style of XML and put all
+the posts into one big file called `posts.xml`. The metadata is
+embedded as a JSON value into the XML file in a tag `<meta>`.
+
+The final XML file looks like:
+
+```
+<posts>
+<post>
+<slug>/blog/web-search-homepage</slug>
+<file>../content/blog/.../xx.md</file>
+<meta>
+{
+  "categories" : ["Strus","Search",
+    "Information Retrieval"],
+  "date" : "2017-04-12T15:49:11+01:00",
+  "thumbnail" : "/images/blog/.../strus.jpg",
+  "title" : "Web search for my homepage"
+}
+</meta>
+<body>
+<para>
+I wanted to add a search function to my
+web page. As the website is built with
+Hugo as a set of static
+...
+```
+
+The cool thing about the way Strus is analyzing the document
+is that you can choose where to split the document into logical
+documents (in our case the blog entries and web pages) and
+you can switch the format from XML to JSON in embedded content.
+
+I packaged this whole ugly conversion step into a script like this:
+
+```
+./create_xml.sh > posts.xml
+```
+
+## Configuring the document analysis and indexing process
+
+Now we define the configuration for the text analysis. Basically
+we tell the system where to split the document into retrievable
+items, which features we want to be able to search for and what
+attributes and text we want to show in the ranklist.
+
+The file `document.ana` contains a configuration which describes
+how Strus should analyze and index the documents:
+
+```
+[Document]
+	post = /posts/post;
+```
+
+which means split on the XPath expression `/posts/post`.
+
+```
+[Content]
+	"encoding=UTF-8; content=JSON;" /posts/post/meta();
+```
+
+This says that our metadata tag contains a different format, JSON.
+Later we can select the individual fields inside the XML tag `meta`
+by specifying a path to the JSON key:
+
+```
+[Attribute]
+	...
+	title = orig content /posts/post/meta()/title();
+	categories = orig content /posts/post/meta()/categories();
+	thumbnail = orig content /posts/post/meta()/thumbnail();
+	...
+```
+
+The other attributes we select from the XML tags directly:
+
+```
+[Attribute]
+	docid = orig content /posts/post/slug();
+```
+
+Now for the things we want to be able to search for.
For now
+simply stemmed words in English are fine:
+
+```
+[SearchIndex]
+	word = lc:convdia(en):stem(en):lc regex("([A-Za-z']+)") /posts/post/meta()/title();
+	word = lc:convdia(en):stem(en):lc regex("([A-Za-z']+)") /posts/post/body//para();
+	sentence = empty punctuation("en") /posts/post/body//para();
+```
+
+The sentence marker is used to pick the best sentences later
+when presenting the hit in the ranklist.
+
+```
+[ForwardIndex]
+	title = orig split /posts/post/meta()/title();
+	text = orig split /posts/post/body//para();
+```
+
+The forward index stores the document almost verbatim as a sequence
+of title and text tokens. So when we get a hit in a search result
+we can present a selection of them (usually a sentence containing
+the matches) in the ranklist.
+
+Finally, we need to count the number of words per document;
+this is needed for the retrieval function:
+
+```
+[Aggregator]
+	doclen = count( word );
+```
+
+With that we can now index our collection and build a search index:
+
+```
+strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16'
+strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml
+```
+
+The `storage` directory contains the complete search index which we
+can copy to the server running the `strusWebService`.
+
+## Installing the strusWebService for querying
+
+On a publicly available server I installed the 'strusWebService' from the
+[OpenBuildService](https://software.opensuse.org/download.html?project=home:andreas_baumann&package=struswebservice)
+with:
+
+```
+pacman -S struswebservice
+```
+
+The search index I copied into `/srv/strusWebService/storage`.
+
+And I started the service:
+
+```
+systemctl enable strusWebService
+systemctl start strusWebService
+```
+
+## Extending the Hugo theme by adding a search text field
+
+So, the static web page needs a search field now and
+some Javascript code doing the search request to the
+strusWebService (with JSON on HTTP).
+
+This leads to the problem of CORS (Cross-Origin Resource Sharing),
+which has to be configured in the strusWebService (which
+must allow requests originating from the server which
+delivered the static HTML page generated with Hugo).
+
+Also I had to stem and lowercase the query terms with
+a [Snowball Javascript library](https://github.com/fortnightlabs/snowball-js)
+as the strusWebService doesn't support query analysis right now.
+
+## Workflow
+
+So, when I change or add a page on my website I have to run:
+
+```
+./create_xml.sh > posts.xml
+rm -rf storage
+mkdir storage
+strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16'
+strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml
+scp -r storage/wwwandreasbaumanncc struswebservice.home:/srv/strusWebService/storage
+```
+
+and of course stop and restart the strusWebService because I replaced
+its search index completely.
+
+# Outlook and future steps
+
+There are several things I want to improve from here.
+
+First, the conversion to XML/JSON is quite hacky and slow.
+Writing segmenters for strus for new file formats like
+TOML or markdown is not that complicated.
+
+A future web service should also support indexing, otherwise
+we cannot properly analyze the query (using the Snowball JS stemmer
+in the browser is a hack which doesn't work if the query analysis
+is more complex).
+
+The upload to the remote webserver can be done more elegantly.
+One idea is to use the backup/replication API of strus for this:
+build an index locally and sync it to the strusWebService.
diff --git a/static/images/blog/web-search-homepage/strus.jpg b/static/images/blog/web-search-homepage/strus.jpg
new file mode 100644
index 0000000..70c0776
Binary files /dev/null and b/static/images/blog/web-search-homepage/strus.jpg differ
--
cgit v1.2.3-54-g00ecf