From f0dbe295d7d4f8282c2bde86bc4387b367bef589 Mon Sep 17 00:00:00 2001
From: Andreas Baumann
Date: Fri, 3 Mar 2017 19:57:20 +0100
Subject: added a first strus search to the site

---
 strus/README        | 23 +++++++++++++++++++++++
 strus/create_xml.sh | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 strus/document.ana  | 25 +++++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 strus/README
 create mode 100755 strus/create_xml.sh
 create mode 100644 strus/document.ana

(limited to 'strus')

diff --git a/strus/README b/strus/README
new file mode 100644
index 0000000..d503c0c
--- /dev/null
+++ b/strus/README
@@ -0,0 +1,23 @@
+# Search index with strus
+
+# For now create an XML from the content, later have a directory iterator
+# over 'content' and read TOML/YAML headers and markdown...
+
+# TODO: this becomes obsolete with a Hugo segmenter which understands
+# YAML/TOML/JSON and Markdown:
+# remarshal (https://github.com/dbohdan/remarshal)
+# pandoc (http://pandoc.org/)
+
+./create_xml.sh > posts.xml
+
+xmllint -noout posts.xml
+
+# Create the strus search index:
+
+rm -rf storage
+mkdir storage
+strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16'
+
+strusAnalyze document.ana posts.xml |& less
+
+strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml
diff --git a/strus/create_xml.sh b/strus/create_xml.sh
new file mode 100755
index 0000000..fa7ddd6
--- /dev/null
+++ b/strus/create_xml.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+#
+# Write one XML document to stdout that holds every Markdown page found
+# below ../content, using the element paths document.ana reads:
+# /posts/post/slug, /posts/post/filename, /posts/post/meta, /posts/post/body.
+# Progress is reported on stderr.  Scratch files (F1, F2, meta.*, body.*)
+# are created in the current directory.  Requires remarshal and pandoc.
+
+# NOTE(review): the markup emitted below was rebuilt from the paths used in
+# document.ana; the exact XML declaration is an assumption - confirm.
+cat <<EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<posts>
+EOF
+
+find ../content/ -name '*.md' | while IFS= read -r file; do
+  echo "$file.." 1>&2
+
+  # Slug: the path below the content root without "/_index.md" or ".md".
+  slug=$(printf '%s\n' "$file" |
+    sed -e 's@^\.\./content@@' -e 's@/_index\.md$@@' -e 's@\.md$@@')
+  if test "x$slug" = "x"; then
+    slug="/"
+  fi
+
+  # Cut the page at its "+++" lines: F1 gets the first "+++" and what
+  # follows it, F2 the second "+++" and what follows that.
+  awk 'BEGIN { i = 0 } /\+\+\+/{x="F"++i;}{print > x;}' "$file" >/dev/null 2>&1
+
+  # Skip pages that did not produce both parts.  Two separate tests are
+  # used because "test ! -f F1 -a -f F2" negates only the first operand.
+  if test ! -f F1 || test ! -f F2; then
+    rm -f F1 F2
+    continue
+  fi
+
+  tail -n +2 F1 > meta.toml
+  tail -n +3 F2 > body.md
+
+  # Escape XML special characters; "&" goes first so that the entities
+  # written for "<" and ">" are not escaped a second time.
+  sed -i -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' meta.toml body.md
+
+  remarshal -if toml -of json meta.toml > meta.json
+  pandoc -f markdown -t docbook body.md > body.xml
+
+  echo "<post>"
+  echo "<slug>$slug</slug>"
+  echo "<filename>$file</filename>"
+  echo "<meta>"
+  cat meta.json
+  echo "</meta>"
+  echo "<body>"
+  cat body.xml
+  echo "</body>"
+  echo "</post>"
+
+  rm -f meta.* body.* F1 F2
+done
+
+cat <<EOF
+</posts>
+EOF
diff --git a/strus/document.ana b/strus/document.ana
new file mode 100644
index 0000000..a296775
--- /dev/null
+++ b/strus/document.ana
@@ -0,0 +1,25 @@
+[Attribute]
+    docid = orig content /posts/post/slug();
+    filename = orig content /posts/post/filename();
+# title = orig content /posts/post/meta()/title();
+
+[Content]
+    "encoding=UTF-8; content=JSON;" /posts/post/meta();
+
+[SearchIndex]
+# word = lc:convdia(en):stem(en):lc word /posts/post/body/para();
+# word = orig split /posts/post/body/para();
+    word = lc regex("([A-Za-z']+)") /posts/post/body/para();
+
+[ForwardIndex]
+    word = orig split /posts/post/body/para();
+
+[Document]
+    post = /posts/post;
+
+#[MetaData]
+# release_date = date2int("d 1877-01-01", "%Y-%m-%d %H:%M:%s *") content /posts/post/meta()/date;
+
+[Aggregator]
+    doclen = count( word );
+
+-- 
cgit v1.2.3-54-g00ecf