better GDPR page

author: Andreas Baumann <mail@andreasbaumann.cc> 2018-04-27 20:45:12 +0200
committer: Andreas Baumann <mail@andreasbaumann.cc> 2018-04-27 20:45:12 +0200
commit: 74087a15b63dbc062e1c8a785917f44cda5f14c7 (patch)
tree: bd20677420b33aae27bb3ba6065c1757880caecf /search
parent: e372ee22511a0e084b7957316909705bf9628d01 (diff)
download: www-andreasbaumann-cc-74087a15b63dbc062e1c8a785917f44cda5f14c7.tar.gz
www-andreasbaumann-cc-74087a15b63dbc062e1c8a785917f44cda5f14c7.tar.bz2
5 files changed, 193 insertions, 0 deletions
diff --git a/search/elasticlunr/README b/search/elasticlunr/README
new file mode 100644
index 0000000..c1c318c
--- /dev/null
+++ b/search/elasticlunr/README
@@ -0,0 +1,24 @@
+# Search index with elasticlunr
+
+# Create a search index which can be served statically alongside the
+# static HTML pages to elasticlunr.js.
+
+# YAML/TOML/JSON and Markdown:
+# remarshal (https://github.com/dbohdan/remarshal)
+# pandoc (http://pandoc.org/)
+
+# create XML and clean up some problematic constructs
+./create_xml.sh > posts.xml
+sed -i 's/xlink:href/href/g' posts.xml
+sed -i 's/ xml:id="[^"]\+"//g' posts.xml
+xmllint -noout posts.xml
+
+# convert XML to JSON (at least the relevant fields for the index)
+
+# use posts.json in a | node create_index.js pipeline
+# -> results in posts_index.json
+
+# add as static contents to hugo site
+
+# load from JS search code on demand (first query) if possible,
+# if small, do it immediately when loading the search widget.
diff --git a/search/elasticlunr/create_xml.sh b/search/elasticlunr/create_xml.sh
new file mode 100755
index 0000000..50ce4b3
--- /dev/null
+++ b/search/elasticlunr/create_xml.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+# Write every Markdown page below ../../content as one <posts> XML document
+# to stdout (progress goes to stderr). Needs GNU sed, awk, remarshal, pandoc.
+# Scratch files F1, F2, meta.* and body.* are created in the current directory.
+
+cat <<EOF
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<posts>
+EOF
+
+# The in-place edits below use 'sed -i'; on macOS GNU sed is installed as gsed.
+if test "$(uname -s)" = 'Darwin'; then
+	SED=gsed
+else
+	SED=sed
+fi
+
+# Read the paths line by line so that names containing blanks are not split.
+find ../../content/ -name '*.md' | while IFS= read -r file; do
+	printf '%s..\n' "$file" 1>&2
+
+	# Slug = path below the content root without the '.md' / '/_index.md' suffix.
+	slug=$(printf '%s\n' "$file" | $SED -e 's@^\.\./\.\./content@@' \
+		-e 's@/_index\.md$@@' -e 's@\.md$@@' -e 's@^//@/@')
+	slug=${slug:-/}
+
+	# F1 gets the opening '+++' line plus the front matter, F2 the closing '+++'
+	# line plus the body; any later '+++' inside the body simply stays in F2.
+	rm -f F1 F2
+	awk '/\+\+\+/ && i < 2 { x = "F" ++i } x != "" { print > x }' "$file"
+
+	# Skip pages that lack a complete front matter block (either part missing).
+	if test ! -f F1 || test ! -f F2; then
+		rm -f F1 F2
+		continue
+	fi
+
+	tail -n +2 F1 > meta.toml
+	tail -n +3 F2 > body.md
+
+	# Escape the XML specials; '&' goes first so the new entities stay intact.
+	$SED -i -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' meta.toml body.md
+
+	remarshal -if toml -of json meta.toml > meta.json
+	pandoc -f markdown -t docbook body.md > body.xml
+
+	printf '<post>\n<slug>%s</slug>\n<filename>%s</filename>\n<meta>\n' "$slug" "$file"
+	cat meta.json
+	printf '</meta>\n<body>\n'
+	cat body.xml
+	printf '</body>\n</post>\n'
+
+	rm -f meta.* body.* F1 F2
+done
+
+cat <<EOF
+</posts>
+EOF
diff --git a/search/strus/README b/search/strus/README
new file mode 100644
index 0000000..4f2ad15
--- /dev/null
+++ b/search/strus/README
@@ -0,0 +1,26 @@
+# Search index with strus
+
+# For now create an XML from the content, later have a directory iterator
+# over 'content' and read TOML/YAML headers and markdown...
+
+# TODO: this becomes obsolete with a Hugo segmenter which understands
+# YAML/TOML/JSON and Markdown:
+# remarshal (https://github.com/dbohdan/remarshal)
+# pandoc (http://pandoc.org/)
+# client-side needs:
+# https://github.com/fortnightlabs/snowball-js
+
+./create_xml.sh > posts.xml
+
+xmllint -noout posts.xml
+
+# test configuration of document analysis
+
+strusAnalyze document.ana posts.xml |& less
+
+# Create the strus search index:
+
+rm -rf storage
+mkdir storage
+strusCreate -s 'path=storage/wwwandreasbaumanncc; metadata=doclen UINT16, publish_date UINT16'
+strusInsert -c 1000 -f 1 -t 1 -s "path=storage/wwwandreasbaumanncc" document.ana posts.xml
diff --git a/search/strus/create_xml.sh b/search/strus/create_xml.sh
new file mode 100755
index 0000000..50ce4b3
--- /dev/null
+++ b/search/strus/create_xml.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+# Write every Markdown page below ../../content as one <posts> XML document
+# to stdout (progress goes to stderr). Needs GNU sed, awk, remarshal, pandoc.
+# Scratch files F1, F2, meta.* and body.* are created in the current directory.
+
+cat <<EOF
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<posts>
+EOF
+
+# The in-place edits below use 'sed -i'; on macOS GNU sed is installed as gsed.
+if test "$(uname -s)" = 'Darwin'; then
+	SED=gsed
+else
+	SED=sed
+fi
+
+# Read the paths line by line so that names containing blanks are not split.
+find ../../content/ -name '*.md' | while IFS= read -r file; do
+	printf '%s..\n' "$file" 1>&2
+
+	# Slug = path below the content root without the '.md' / '/_index.md' suffix.
+	slug=$(printf '%s\n' "$file" | $SED -e 's@^\.\./\.\./content@@' \
+		-e 's@/_index\.md$@@' -e 's@\.md$@@' -e 's@^//@/@')
+	slug=${slug:-/}
+
+	# F1 gets the opening '+++' line plus the front matter, F2 the closing '+++'
+	# line plus the body; any later '+++' inside the body simply stays in F2.
+	rm -f F1 F2
+	awk '/\+\+\+/ && i < 2 { x = "F" ++i } x != "" { print > x }' "$file"
+
+	# Skip pages that lack a complete front matter block (either part missing).
+	if test ! -f F1 || test ! -f F2; then
+		rm -f F1 F2
+		continue
+	fi
+
+	tail -n +2 F1 > meta.toml
+	tail -n +3 F2 > body.md
+
+	# Escape the XML specials; '&' goes first so the new entities stay intact.
+	$SED -i -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' meta.toml body.md
+
+	remarshal -if toml -of json meta.toml > meta.json
+	pandoc -f markdown -t docbook body.md > body.xml
+
+	printf '<post>\n<slug>%s</slug>\n<filename>%s</filename>\n<meta>\n' "$slug" "$file"
+	cat meta.json
+	printf '</meta>\n<body>\n'
+	cat body.xml
+	printf '</body>\n</post>\n'
+
+	rm -f meta.* body.* F1 F2
+done
+
+cat <<EOF
+</posts>
+EOF
diff --git a/search/strus/document.ana b/search/strus/document.ana
new file mode 100644
index 0000000..8fbcf3e
--- /dev/null
+++ b/search/strus/document.ana
@@ -0,0 +1,27 @@
+[Document]
+	post = /posts/post;
+
+[Content]
+	"encoding=UTF-8; content=JSON;" /posts/post/meta();
+
+[Attribute]
+	docid = orig content /posts/post/slug();
+	title = orig content /posts/post/meta()/title();
+	categories = orig content /posts/post/meta()/categories();
+	thumbnail = orig content /posts/post/meta()/thumbnail();
+
+[SearchIndex]
+	word = lc:convdia(en):stem(en):lc regex("([A-Za-z']+)") /posts/post/meta()/title();
+	word = lc:convdia(en):stem(en):lc regex("([A-Za-z']+)") /posts/post/body//para();
+	sentence = empty punctuation("en") /posts/post/body//para();
+	
+[ForwardIndex]
+	title = orig split /posts/post/meta()/title();
+	text = orig split /posts/post/body//para();
+
+#[MetaData]
+#	release_date = date2int("d 1877-01-01", "%Y-%m-%d %H:%M:%s *") content /posts/post/meta()/date;
+
+[Aggregator]
+	doclen = count( word );
+
author	Andreas Baumann <mail@andreasbaumann.cc>	2018-04-27 20:45:12 +0200
committer	Andreas Baumann <mail@andreasbaumann.cc>	2018-04-27 20:45:12 +0200
commit	74087a15b63dbc062e1c8a785917f44cda5f14c7 (patch)
tree	bd20677420b33aae27bb3ba6065c1757880caecf /search
parent	e372ee22511a0e084b7957316909705bf9628d01 (diff)
download	www-andreasbaumann-cc-74087a15b63dbc062e1c8a785917f44cda5f14c7.tar.gz www-andreasbaumann-cc-74087a15b63dbc062e1c8a785917f44cda5f14c7.tar.bz2