diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-09 08:59:02 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-09 08:59:02 +0200 |
commit | 7d8b1ff684b412da292e0fc734748975188a0f10 (patch) | |
tree | 2673e3da51cc80bfc38a426048b30a4d71c31d4c /tolua/src/bin/lua/clean.lua | |
parent | 62c5bb90525baf0d82c23892c2666f611750d63c (diff) | |
download | crawler-7d8b1ff684b412da292e0fc734748975188a0f10.tar.gz crawler-7d8b1ff684b412da292e0fc734748975188a0f10.tar.bz2 |
first trials with a Google normalizer called from Lua, std::string is the problem currently
and the missing wrapper for the URL class
also added a local 'tolua', we will have to hack it
Diffstat (limited to 'tolua/src/bin/lua/clean.lua')
-rw-r--r-- | tolua/src/bin/lua/clean.lua | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/tolua/src/bin/lua/clean.lua b/tolua/src/bin/lua/clean.lua new file mode 100644 index 0000000..2442291 --- /dev/null +++ b/tolua/src/bin/lua/clean.lua @@ -0,0 +1,75 @@ +-- mark up comments and strings
+-- marker bytes that stand in for string delimiters (' " [[ ]]) while scanning
+STR1, STR2 = "\001", "\002"
+STR3, STR4 = "\003", "\004"
+-- marker byte that stands in for the comment opener (--)
+REM = "\005"
+-- pattern capturing any single one of the five markers above
+ANY = "([\001-\005])"
+-- marker bytes that stand in for the escaped quotes \' and \"
+ESC1, ESC2 = "\006", "\007"
+
+-- Substitution table walked in order by mask() and unmask().
+-- Each row is {marker, pattern searched in the source, text restored later}.
+MASK = { -- the substitution order is important
+ -- an escaped backslash has to be consumed before the escaped quotes,
+ -- otherwise the quote closing a string such as 'a\\' is taken for \'
+ -- NOTE(review): assumes the byte \008 never occurs in the source text,
+ -- the same assumption already made for the markers \001-\007
+ {"\008", "\\\\", "\\\\"},
+ {ESC1, "\\'", "\\'"},
+ {ESC2, '\\"', '\\"'},
+ {STR1, "'", "'"},
+ {STR2, '"', '"'},
+ {STR3, "%[%[", "[["},
+ {STR4, "%]%]", "]]"},
+ {REM , "%-%-", "--"},
+}
+
+-- Replace every delimiter listed in MASK by its marker byte.
+-- s: source text; returns the text with the markers substituted.
+-- Rows are applied first to last, since earlier rows shield later ones.
+function mask (s)
+ local count = getn(MASK)
+ local idx = 1
+ while idx <= count do
+  local row = MASK[idx]
+  s = gsub(s,row[2],row[1])
+  idx = idx + 1
+ end
+ return s
+end
+
+-- Put the original delimiters back in place of the marker bytes.
+-- s: masked text; returns the text with every MASK marker restored.
+function unmask (s)
+ local count = getn(MASK)
+ local idx = 1
+ while idx <= count do
+  local row = MASK[idx]
+  s = gsub(s,row[1],row[3])
+  idx = idx + 1
+ end
+ return s
+end
+
+-- Strip comments and redundant whitespace from a chunk of Lua source.
+-- s: the source text of the chunk.
+-- Returns the cleaned text, or nil when dostring rejects the wrapped chunk.
+function clean (s)
+ -- check for compilation error
+ local code = "return function () " .. s .. " end"
+ if not dostring(code) then
+  return nil
+ end
+
+ local S = "" -- saved string
+
+ s = mask(s)
+
+ -- remove blanks and comments
+ while 1 do
+  local b,e,d = strfind(s,ANY)
+  if not b then
+   -- no marker left: the remainder is plain code
+   S = S..s
+   break
+  end
+  S = S..strsub(s,1,b-1)
+  s = strsub(s,b+1)
+  if d==REM then
+   -- drop the comment text but keep the newline that ends it
+   s = gsub(s,"[^\n]*(\n?)","%1",1)
+  elseif d==STR4 then
+   -- a "]]" outside any string (as in a[b[1]]) is ordinary code;
+   -- keep its marker so that unmask restores it instead of losing it
+   S = S..d
+  else
+   -- d opens a string: quotes close with themselves, "[[" with "]]"
+   local close = d
+   if d==STR3 then
+    close = STR4
+   end
+   e = strfind(s,close)
+   if not e then
+    -- no terminator found: copy the rest untouched rather than
+    -- failing on arithmetic with nil
+    S = S..d..s
+    break
+   end
+   S = S..d..strsub(s,1,e)
+   s = strsub(s,e+1)
+  end
+ end
+ -- eliminate unnecessary spaces
+ S = gsub(S,"[ \t]+"," ")
+ S = gsub(S,"[ \t]*\n[ \t]*","\n")
+ S = gsub(S,"\n+","\n")
+ S = unmask(S)
+ return S
+end
+
|