diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2017-03-03 19:57:20 +0100 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2017-03-03 19:57:20 +0100 |
commit | f0dbe295d7d4f8282c2bde86bc4387b367bef589 (patch) | |
tree | 4f09c448098b57656c789d7dd358271498c34bc5 /strus | |
parent | 6050f5e92d0386944cddd0adc9163bfe0f88fea3 (diff) | |
download | www-andreasbaumann-cc-f0dbe295d7d4f8282c2bde86bc4387b367bef589.tar.gz www-andreasbaumann-cc-f0dbe295d7d4f8282c2bde86bc4387b367bef589.tar.bz2 |
added a first strus search to the site
Diffstat (limited to 'strus')
-rw-r--r-- | strus/README | 23 | ||||
-rwxr-xr-x | strus/create_xml.sh | 50 | ||||
-rw-r--r-- | strus/document.ana | 25 |
3 files changed, 98 insertions, 0 deletions
diff --git a/strus/README b/strus/README new file mode 100644 index 0000000..d503c0c --- /dev/null +++ b/strus/README @@ -0,0 +1,23 @@ +# Search index with strus + +# For now create an XML from the content, later have a directory iterator +# over 'content' and read TOML/YAML headers and markdown... + +# TODO: this becomes obsolete with a Hugo segmenter which understands +# YAML/TOML/JSON and Markdown: +# remarshal (https://github.com/dbohdan/remarshal) +# pandoc (http://pandoc.org/) + +./create_xml.sh > posts.xml + +xmllint -noout posts.xml + +# Create the strus search index: + +rm -rf storage +mkdir storage +strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16' + +strusAnalyze document.ana posts.xml |& less + +strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml diff --git a/strus/create_xml.sh b/strus/create_xml.sh new file mode 100755 index 0000000..fa7ddd6 --- /dev/null +++ b/strus/create_xml.sh @@ -0,0 +1,50 @@ +#!/bin/sh + +cat <<EOF +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<posts> +EOF + +for file in `find ../content/ -name '*.md'`; do + echo "$file.." 1>&2 + slug=`echo $file | sed 's@../content@@g' | sed 's@/_index.md$@@g' | sed 's@.md$@@g'` + if test "x$slug" = "x"; then + slug="/" + fi + + awk 'BEGIN { i = 0 } /\+\+\+/{x="F"++i;}{print > x;}' $file >/dev/null 2>&1 + + if test ! 
-f F1 -a -f F2; then + continue + fi + + tail -n +2 F1 > meta.toml + tail -n +3 F2 > body.md + + sed -i 's/\&/&amp;/g' meta.toml + sed -i 's/</\&lt;/g' meta.toml + sed -i 's/>/\&gt;/g' meta.toml + sed -i 's/\&/&amp;/g' body.md + sed -i 's/</\&lt;/g' body.md + sed -i 's/>/\&gt;/g' body.md + + remarshal -if toml -of json meta.toml > meta.json + pandoc -f markdown -t docbook body.md > body.xml + + echo "<post>" + echo "<slug>$slug</slug>" + echo "<filename>$file</filename>" + echo "<meta>" + cat meta.json + echo "</meta>" + echo "<body>" + cat body.xml + echo "</body>" + echo "</post>" + + rm -f meta.* body.* F1 F2 +done + +cat <<EOF +</posts> +EOF diff --git a/strus/document.ana b/strus/document.ana new file mode 100644 index 0000000..a296775 --- /dev/null +++ b/strus/document.ana @@ -0,0 +1,25 @@ +[Attribute] + docid = orig content /posts/post/slug(); + filename = orig content /posts/post/filename(); +# title = orig content /posts/post/meta()/title(); + +[Content] + "encoding=UTF-8; content=JSON;" /posts/post/meta(); + +[SearchIndex] +# word = lc:convdia(en):stem(en):lc word /posts/post/body/para(); +# word = orig split /posts/post/body/para(); + word = lc regex("([A-Za-z']+)") /posts/post/body/para(); + +[ForwardIndex] + word = orig split /posts/post/body/para(); + +[Document] + post = /posts/post; + +#[MetaData] +# release_date = date2int("d 1877-01-01", "%Y-%m-%d %H:%M:%s *") content /posts/post/meta()/date; + +[Aggregator] + doclen = count( word ); + |