diff options
Diffstat (limited to 'indexlib/tests/large-scale')
-rwxr-xr-x | indexlib/tests/large-scale/do-test.zsh | 65 | ||||
-rw-r--r-- | indexlib/tests/large-scale/generate.py | 127 |
2 files changed, 192 insertions, 0 deletions
diff --git a/indexlib/tests/large-scale/do-test.zsh b/indexlib/tests/large-scale/do-test.zsh
new file mode 100755
--- /dev/null
+++ b/indexlib/tests/large-scale/do-test.zsh
@@ -0,0 +1,65 @@
+#!/usr/bin/env zsh
+
+# Large-scale regression test for indexlib.
+#
+# Usage: do-test.zsh [INPUTFILE]
+#
+# INPUTFILE is a large plain-text seed. generate.py turns it into random
+# documents (output/text_*), word lists to search for (output/words_*.list)
+# and scripts that compute the expected matches with grep
+# (output/words_*.script). Every word list is then run through
+# "indexlibadmin search" and the result is compared with the grep answer.
+
+# The first argument selects the seed file; ulyss12.txt stays the default.
+inputfile=${1:-ulyss12.txt}
+
+indexlibadmin=../../indexlibadmin
+index=index
+
+# Validate the seed before any directory is deleted or created.
+if [[ ! -r $inputfile ]]; then
+    cat 1>&2 <<END
+This test needs a large input file as a seed, but '$inputfile' is not readable.
+
+You might consider using http://www.gutenberg.org/ as a starting point to get a file.
+
+Please pass the input file as the first argument to this script ($0).
+END
+    exit 1
+fi
+
+rm -rf $index
+mkdir $index
+
+rm -rf output
+mkdir output/
+
+rm -rf tmp
+mkdir tmp/
+
+# Stop if generation fails: the comparisons below would be meaningless.
+python generate.py < $inputfile || exit 1
+
+$indexlibadmin remove $index
+for t in output/text_*; do
+    $indexlibadmin add $index $t
+done
+
+for w in output/words_*.list; do
+    $indexlibadmin search $index "$(<$w)" >tmp/got 2>/dev/null
+    # The companion script leaves the grep-derived answer in tmp/expected.
+    source ${w:r}.script
+    if ! diff -q tmp/got tmp/expected; then
+        cat <<END
+Pattern $w was wrong!
+
+Diff:
+END
+        diff -u tmp/got tmp/expected
+        echo "End of Diff."
+        exit 1
+    fi
+done
+
+rm -f tmp/got tmp/expected tmp/pat
+rmdir tmp
diff --git a/indexlib/tests/large-scale/generate.py b/indexlib/tests/large-scale/generate.py
new file mode 100644
--- /dev/null
+++ b/indexlib/tests/large-scale/generate.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+"""Generate random documents and search test cases from a seed text.
+
+The seed text is read from standard input and turned into a Markov chain
+keyed on pairs of consecutive words. Written relative to the current
+directory:
+
+  output/text_N          randomly generated documents
+  output/words_N.list    one to five search words, one per line
+  output/words_N.script  shell commands that leave in tmp/expected the
+                         names of the documents grep finds every word in
+"""
+import random
+import re
+import sys
+
+NUM_TEXTS = 10000          # documents to generate
+TEXT_LENGTH = 2000         # words per document
+NUM_WORD_LISTS = 1000      # search test cases to generate
+MAX_PICK_ATTEMPTS = 10000  # draws before a seed counts as unusable
+
+START = ('', '')           # chain state before any word has been read
+DIGIT = re.compile(r'\d')
+NON_WORD = re.compile(r'\W')
+
+
+def init_chain(infile):
+    """Build the word chain from an iterable of text lines.
+
+    Returns a dict mapping each pair of consecutive words to the list of
+    words that followed it (repeats kept, so frequent successors are
+    drawn more often). The pair that ends the input maps to None unless
+    it also occurred earlier with a successor.
+    """
+    chain = {}
+    last = START
+    for line in infile:
+        for word in line.split():
+            # setdefault replaces the Python-2-only dict.has_key().
+            chain.setdefault(last, []).append(word)
+            last = (last[1], word)
+    # Mark the end of the text, but keep successors already recorded for
+    # an earlier occurrence of the same pair instead of overwriting them.
+    chain.setdefault(last, None)
+    return chain
+
+
+def output(chain, length, outputfile, start=2000):
+    """Write one random document of up to `length` words to outputfile.
+
+    The walk begins at the start of the chain and its first `start`
+    words are discarded, presumably so that the documents do not all
+    open alike. The document is cut short if the walk reaches the end
+    of the seed text. Words are space-separated on a single line.
+    """
+    last = START
+    for i in range(start + length):
+        followers = chain[last]
+        if followers is None:
+            break
+        word = random.choice(followers)
+        last = (last[1], word)
+        # ">=" writes exactly `length` words; "i > start" dropped one.
+        if i >= start:
+            outputfile.write(word)
+            outputfile.write(' ')
+    outputfile.write("\n")
+
+
+def _pick_word(keys):
+    """Return a random word from the chain keys, reduced to word characters.
+
+    Draws that contain a digit, or that are empty once punctuation is
+    stripped (the start-of-text keys contain ''), are rejected: an empty
+    word would turn the grep pattern into one that matches every file.
+    Raises ValueError after MAX_PICK_ATTEMPTS rejected draws.
+    """
+    for _ in range(MAX_PICK_ATTEMPTS):
+        word = NON_WORD.sub('', random.choice(random.choice(keys)))
+        if word and not DIGIT.search(word):
+            return word
+    raise ValueError("no usable word found in the seed text")
+
+
+def get_words(chain, nwords, outputfile, scriptfile):
+    """Write nwords random search words and the script that checks them.
+
+    Each word goes on its own line of outputfile. scriptfile receives
+    shell commands that start from the list of all output/text_* names
+    and, for every word, keep only the names grep reports for it; the
+    surviving names are left in tmp/expected.
+    """
+    keys = list(chain)  # copied once per call rather than once per draw
+    scriptfile.write("(for f in output/text_*; echo $f) > tmp/so_far\n")
+    for _ in range(nwords):
+        word = _pick_word(keys)
+        outputfile.write(word + "\n")
+        scriptfile.write(
+            "grep -i -E -e '(\\W|^)%s' -l output/text_* >tmp/part_%s\n"
+            % (word, word))
+        # Intersection (from scriptome): print the lines of tmp/part_WORD
+        # that also occur in tmp/so_far.
+        scriptfile.write(
+            "perl -e '($file1, $file2) = @ARGV; open F2, $file2; "
+            "while (<F2>) {$h2{$_}++}; open F1, $file1; "
+            "while (<F1>) {if ($h2{$_}) {print $_; $h2{$_} = 0;}}' "
+            "tmp/part_%s tmp/so_far >tmp/so_far_\n" % word)
+        scriptfile.write("mv tmp/so_far_ tmp/so_far\n")
+        scriptfile.write("rm tmp/part_%s\n" % word)
+    scriptfile.write("mv tmp/so_far tmp/expected\n")
+
+
+def main():
+    """Read the seed from stdin and write every test file under output/."""
+    chain = init_chain(sys.stdin)
+    for i in range(NUM_TEXTS):
+        with open("output/text_%d" % (i + 1), 'w') as text:
+            output(chain, TEXT_LENGTH, text)
+    for i in range(NUM_WORD_LISTS):
+        base = "output/words_%d" % (i + 1)
+        with open(base + ".list", 'w') as words:
+            with open(base + ".script", 'w') as script:
+                get_words(chain, random.randint(1, 5), words, script)
+
+
+if __name__ == "__main__":
+    main()