summary refs log tree commit diff stats
path: root/indexlib/tests/large-scale
diff options
context:
space:
mode:
Diffstat (limited to 'indexlib/tests/large-scale')
-rwxr-xr-x indexlib/tests/large-scale/do-test.zsh 55
-rw-r--r-- indexlib/tests/large-scale/generate.py 51
2 files changed, 106 insertions, 0 deletions
diff --git a/indexlib/tests/large-scale/do-test.zsh b/indexlib/tests/large-scale/do-test.zsh
new file mode 100755
index 000000000..b8d47b45d
--- /dev/null
+++ b/indexlib/tests/large-scale/do-test.zsh
@@ -0,0 +1,55 @@
#!/usr/bin/env zsh
#
# Large-scale regression test for indexlib.
#
# generate.py reads a seed text on stdin and writes random documents
# (output/text_*) plus, for every query, a word list (output/words_N.list)
# and a companion script (output/words_N.script) that derives the expected
# hit list with grep.  Every document is added to a fresh index, each word
# list is passed to "indexlibadmin search", and the answer is compared with
# the grep-derived expectation left in tmp/expected.
#
# Usage: do-test.zsh [inputfile]

# SET INPUT FILE BELOW (used when no file is given on the command line)
default_inputfile=ulyss12.txt
inputfile=${1:-$default_inputfile}

indexlibadmin=../../indexlibadmin
index=index

# Validate the seed before anything is deleted or created.
if [[ -z $inputfile || ! -r $inputfile ]]; then
    cat 1>&2 <<END
This test needs a large input file as a seed.

You might consider using http://www.gutenberg.org/ as a starting point to get a file.

Please edit this script ($0) to set the input file.
END
    exit 1
fi

# Start from empty index, output and scratch directories.
rm -rf $index
mkdir $index

rm -rf output
mkdir output/

rm -rf tmp
mkdir tmp/

# Without generated data the loops below have nothing meaningful to check.
python generate.py < $inputfile || exit 1

$indexlibadmin remove $index
for t in output/text_*; do
    $indexlibadmin add $index $t
done


for w in output/words_*.list; do
    $indexlibadmin search $index "$(cat $w)" >tmp/got 2>/dev/null
    # words_N.list -> words_N.script; sourcing it writes tmp/expected.
    source output/$(basename $w list)script
    if ! diff -q tmp/got tmp/expected; then
        cat <<END
Pattern $w was wrong!

Diff:
END
        diff -u tmp/got tmp/expected
        echo "End of Diff."
        exit 1
    fi
done

rm -f tmp/got tmp/expected tmp/pat
rmdir tmp
+
diff --git a/indexlib/tests/large-scale/generate.py b/indexlib/tests/large-scale/generate.py
new file mode 100644
index 000000000..3a66df3be
--- /dev/null
+++ b/indexlib/tests/large-scale/generate.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import random
+import re
+
def init_chain(infile):
    """Build a second-order word chain from the lines of *infile*.

    *infile* is any iterable of text lines; each line is split on
    whitespace.  The returned dict maps every pair of consecutive words to
    the list of words that followed that pair, one entry per occurrence.
    The pair ('', '') stands for the start of the text.  The pair formed by
    the last two words maps to None when nothing was ever seen after it,
    which marks the end of the chain.
    """
    chain = {}
    last = ('', '')
    for line in infile:
        for word in line.split():
            chain.setdefault(last, []).append(word)
            last = (last[1], word)
    # Terminate the chain, but keep the successors already recorded for this
    # pair if it also occurred earlier in the text.
    chain.setdefault(last, None)
    return chain
+
def output(chain, length, outputfile, start=2000):
    """Write one line of random text drawn from *chain* to *outputfile*.

    The walk begins at the ('', '') pair.  The first *start* words are
    generated but discarded; the following *length* words are written, each
    followed by a single space.  The walk stops early when the current pair
    has no successors (None, missing or empty).  The line is always
    terminated with a newline, even when no word was written.

    chain      -- mapping of word pairs to successor lists (see init_chain)
    length     -- number of words to write
    outputfile -- object with a write() method
    start      -- number of leading words to skip
    """
    last = ('', '')
    for i in range(length + start):
        successors = chain.get(last)
        if not successors:
            break
        word = random.choice(successors)
        last = (last[1], word)
        # ">=" so that exactly `length` words are emitted after the warm-up.
        if i >= start:
            outputfile.write(word)
            outputfile.write(' ')
    outputfile.write("\n")
+
def get_words(chain, nwords, outputfile, scriptfile):
    """Pick *nwords* random query words and write the matching check script.

    Each word is drawn from a random key pair of *chain*, stripped of
    non-word characters, and redrawn while it contains a digit.  The words
    go to *outputfile*, one per line.  *scriptfile* receives shell commands
    that start from the list of all output/text_* files, intersect it with
    the files grep reports for each word, and leave the final list in
    tmp/expected.

    chain      -- mapping whose keys are word pairs (see init_chain)
    nwords     -- number of words to pick
    outputfile -- object with a write() method; receives the word list
    scriptfile -- object with a write() method; receives the shell script
    """
    has_digit = re.compile(r"\d").search
    # Materialise the keys once: random.choice needs a sequence, and
    # rebuilding the list on every draw is wasted work.
    pairs = list(chain)
    scriptfile.write("(for f in output/text_*; echo $f) > tmp/so_far\n")
    for _ in range(nwords):
        word = '1'  # contains a digit, so the loop body runs at least once
        while has_digit(word):
            word = random.choice(random.choice(pairs))
            word = re.sub(r'\W', '', word)
        outputfile.write(word + "\n")
        # Files containing the word at a word start (case-insensitive).
        scriptfile.write("grep -i -E -e '(\\W|^)%s' -l output/text_* >tmp/part_%s\n" % (word, word))
        # Keep only the lines of tmp/part_<word> that are also in tmp/so_far.
        scriptfile.write("perl -e '($file1, $file2) = @ARGV; open F2, $file2; while (<F2>) {$h2{$_}++}; open F1, $file1; while (<F1>) {if ($h2{$_}) {print $_; $h2{$_} = 0;}}' tmp/part_%s tmp/so_far >tmp/so_far_\n" % word)  # From scriptome
        scriptfile.write("mv tmp/so_far_ tmp/so_far\n")
        scriptfile.write("rm tmp/part_%s\n" % word)
    scriptfile.write("mv tmp/so_far tmp/expected\n")
+
+
# Seed the chain from standard input.
with open("/dev/stdin") as seed:
    chain = init_chain(seed)

# 10000 random documents, each requested at 2000 words.
for i in range(10000):
    with open("output/text_" + str(i + 1), 'w') as text:
        output(chain, 2000, text)


# 1000 queries of one to five words, each with its expectation script.
for i in range(1000):
    with open("output/words_%s.list" % str(i + 1), 'w') as words, \
            open("output/words_%s.script" % str(i + 1), 'w') as script:
        get_words(chain, random.randint(1, 5), words, script)