summaryrefslogtreecommitdiff
path: root/strus
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2017-03-03 19:57:20 +0100
committerAndreas Baumann <mail@andreasbaumann.cc>2017-03-03 19:57:20 +0100
commitf0dbe295d7d4f8282c2bde86bc4387b367bef589 (patch)
tree4f09c448098b57656c789d7dd358271498c34bc5 /strus
parent6050f5e92d0386944cddd0adc9163bfe0f88fea3 (diff)
downloadwww-andreasbaumann-cc-f0dbe295d7d4f8282c2bde86bc4387b367bef589.tar.gz
www-andreasbaumann-cc-f0dbe295d7d4f8282c2bde86bc4387b367bef589.tar.bz2
added a first strus search to the site
Diffstat (limited to 'strus')
-rw-r--r--strus/README23
-rwxr-xr-xstrus/create_xml.sh50
-rw-r--r--strus/document.ana25
3 files changed, 98 insertions, 0 deletions
diff --git a/strus/README b/strus/README
new file mode 100644
index 0000000..d503c0c
--- /dev/null
+++ b/strus/README
@@ -0,0 +1,23 @@
+# Search index with strus
+
+# For now create an XML from the content, later have a directory iterator
+# over 'content' and read TOML/YAML headers and markdown...
+
+# TODO: this becomes obsolete with a Hugo segmenter which understands
+# YAML/TOML/JSON and Markdown:
+# remarshal (https://github.com/dbohdan/remarshal)
+# pandoc (http://pandoc.org/)
+
+./create_xml.sh > posts.xml
+
+xmllint -noout posts.xml
+
+# Create the strus search index:
+
+rm -rf storage
+mkdir storage
+strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16'
+
+strusAnalyze document.ana posts.xml |& less
+
+strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml
diff --git a/strus/create_xml.sh b/strus/create_xml.sh
new file mode 100755
index 0000000..fa7ddd6
--- /dev/null
+++ b/strus/create_xml.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+# Dump every Hugo post below ../content as one <posts> XML document on
+# stdout; progress goes to stderr. Needs remarshal, pandoc and sed -i.
+
+cat <<EOF
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<posts>
+EOF
+
+# Read the paths line by line so names containing blanks stay intact.
+find ../content/ -name '*.md' | while IFS= read -r file; do
+ echo "$file.." 1>&2
+ slug=`echo "$file" | sed 's@../content@@g' | sed 's@/_index.md$@@g' | sed 's@.md$@@g'`
+ if test "x$slug" = "x"; then
+  slug="/"
+ fi
+
+ # Split at the first two '+++' lines: F1 = front matter, F2 = body.
+ rm -f F1 F2
+ awk '/^\+\+\+/ && i < 2 { x = "F" (++i) } { print > x }' "$file" >/dev/null 2>&1
+ # Both parts are required; skip posts without a complete front matter.
+ if test ! -f F1 || test ! -f F2; then
+  rm -f F1 F2
+  continue
+ fi
+
+ tail -n +2 F1 > meta.toml
+ tail -n +3 F2 > body.md
+ # Escape XML special characters, '&' first so the new entities survive.
+ sed -i -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' meta.toml body.md
+ remarshal -if toml -of json meta.toml > meta.json
+ pandoc -f markdown -t docbook body.md > body.xml
+
+ echo "<post>"
+ echo "<slug>$slug</slug>"
+ echo "<filename>$file</filename>"
+ echo "<meta>"
+ cat meta.json
+ echo "</meta>"
+ echo "<body>"
+ cat body.xml
+ echo "</body>"
+ echo "</post>"
+
+ rm -f meta.* body.* F1 F2
+done
+
+cat <<EOF
+</posts>
+EOF
diff --git a/strus/document.ana b/strus/document.ana
new file mode 100644
index 0000000..a296775
--- /dev/null
+++ b/strus/document.ana
@@ -0,0 +1,25 @@
+[Attribute]
+ docid = orig content /posts/post/slug();
+ filename = orig content /posts/post/filename();
+# title = orig content /posts/post/meta()/title();
+# The meta element carries the JSON that create_xml.sh derives from the TOML front matter.
+[Content]
+ "encoding=UTF-8; content=JSON;" /posts/post/meta();
+# Search terms come from /posts/post/body/para; create_xml.sh fills body with pandoc's DocBook output.
+[SearchIndex]
+# word = lc:convdia(en):stem(en):lc word /posts/post/body/para();
+# word = orig split /posts/post/body/para();
+ word = lc regex("([A-Za-z']+)") /posts/post/body/para();
+
+[ForwardIndex]
+ word = orig split /posts/post/body/para();
+
+[Document]
+ post = /posts/post;
+# NOTE(review): README creates the storage with a publish_date field, not release_date -- align before enabling.
+#[MetaData]
+# release_date = date2int("d 1877-01-01", "%Y-%m-%d %H:%M:%s *") content /posts/post/meta()/date;
+
+[Aggregator]
+ doclen = count( word );
+