Annotation of db/prgsrc/updateRS2.pl, revision 1.1
1.1 ! roma7 1: #!/usr/bin/perl -w
! 2:
! 3: =head1 NAME
! 4:
! 5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
! 6: B<$base> информации о вопросах. Использует Tie::Hash
! 7:
! 8: =head1 SYNOPSIS
! 9:
! 10: updateRS1.pl Questions_per_cicle cicles_number
! 11:
! 12: updateRS.pl Questions_per_cicle
! 13:
! 14: updateRS.pl
! 15:
! 16:
! 17: =head1 DESCRIPTION
! 18:
! 19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
! 20: добавляет информацию в таблицы word2question, nests, nf. Поле
! 21: ProcessedBySearch устанавливается в 1. Обрабатывает
! 22: Questions_per_cicle*cicles_number вопросов, сбрасывая
! 23: информацию в базу каждые Questions_per_cicle вопросов.
! 24: Значения по умолчанию:
! 25: Questions_per_cicle=500;
! 26: cicles_number=1.
! 27:
! 28:
! 29: =head1 AUTHOR
! 30:
! 31: Роман Семизаров
! 32:
! 33:
! 34: =cut
! 35:
! 36:
! 37:
! 38: use locale;
! 39: use DBI;
! 40: use POSIX qw (locale_h);
! 41: use lib "../lib";
! 42: use chgkfiles;
! 43: use Tie::Hash;
! 44: use dbchgk;
! 45: open (STDERR,">errors");
! 46: my $initime=time;
! 47: open TIME, ">time";
! 48: do "common.pl";
! 49: do "chgk.cnf";
! 50:
! 51:
! 52:
! 53: package NewStdHash;
! 54: require Tie::Hash;
! 55:
! 56: @ISA = (Tie::StdHash);
! 57:
! 58:
! 59: package main;
! 60:
! 61:
! 62:
! 63: $qlimit=shift||500;
! 64: $times=shift||1;
! 65:
! 66: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
! 67:
! 68: require "check.pl";
! 69:
! 70: open (STDERR,">$stderr") if $stderr;
! 71:
! 72:
! 73: open (UNKNOWN,">$unknown");
! 74:
! 75:
! 76:
! 77: my $nf;
! 78:
! 79: #open WARN, ">$warnings";
! 80:
! 81: #%forbidden=checktable('equalto')? getequalto : ();
! 82:
! 83:
! 84: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
! 85:
! 86: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
! 87:
! 88:
! 89:
! 90: print "Loading dictionaries\n";
! 91:
! 92: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
! 93:
! 94: foreach $d(@dictionaries)
! 95: {
! 96: print "Loading $d\n";
! 97: open (DICT, $d) || print " Not found\n";
! 98: while ( <DICT> )
! 99: {
! 100: chomp;
! 101: s/\s*$//;
! 102: ($aa,$b)=split(/\//);
! 103: $aa=~tr/ёЁ/еЕ/;
! 104: $a= uc $aa;
! 105: $words{$a}.=$b || "!";
! 106: }
! 107: close(DICT);
! 108: }
! 109:
! 110: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
! 111:
! 112:
! 113: print "Getting words...\n";
! 114:
! 115: print TIME "\t\t".(time-$initime)."\n";
! 116:
! 117: for my $commonI(1..$times)
! 118: {
! 119:
! 120:
! 121: print "loading nests\n";
! 122:
! 123: tie %nf, 'NewStdHash';
! 124:
! 125: %nf=getnests;
! 126:
! 127: %nfnumber=getnfnumbers;
! 128:
! 129: print "Getting words...\n";
! 130:
! 131: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
! 132: $sch=0;
! 133: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
! 134: # берём по одному вопросу
! 135: # и вешаем в массив, индексы которого -- словоформы,
! 136: # а значения -- списки вопросов.
! 137: {
! 138:
! 139: print "$sch $id\n" ;#unless (++$sch % 1);
! 140: searchmark($id);
! 141: if ($forbidden{$id}) {next};
! 142: foreach $fieldnumber (0..$#ss) #перебираем поля
! 143: {
! 144: $text=$ss[$fieldnumber];
! 145: next unless $text;
! 146: $text=~tr/ёЁ/еЕ/;
! 147: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
! 148: $text=~s/p(${RLrl})/р$1/gom;
! 149: $text=~s/(${RLrl})p/$1р/gom;
! 150: $text=~s/\s+/ /gmo;
! 151: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
! 152:
! 153: foreach $wordnumber(0..$#list)
! 154: {
! 155: $word=uc $list[$wordnumber];
! 156: next if length $word>$maxwsize;
! 157: if (my $n=$nf{$word}||$newnf{$word})
! 158: {
! 159: @n= split ' ',$n;
! 160: $nfkvo{$_}++ foreach @n;
! 161: $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
! 162: foreach (@n);
! 163: }
! 164: else {
! 165: if ($word=~/^${RLrl}+$/o){ # Русское слово
! 166: # проанализировать по таблице аффиксов,
! 167: # проверить наличие начальных форм в
! 168: # nf, а если таких нет, то
! 169: # и по словарю.
! 170:
! 171: $nf=&checkit(uc $word,\%words);
! 172: if (!$nf) {
! 173: $nf=(uc $word)."/!";
! 174: print UNKNOWN "$nf\n" if $unknown;
! 175: }
! 176:
! 177: } else {# нерусское слово
! 178: $nf=(uc $word)."/!";
! 179: print UNKNOWN "$nf\n" if $unknown;
! 180: }
! 181:
! 182: foreach $n (split ' ', $nf)
! 183: {
! 184: ($f,$flag)=split '/', $n;
! 185: if ($nfnumber=$nfnumber{$f})
! 186: {
! 187: $newnf{$word}.=" $nfnumber";
! 188: $nfkvo{$nfnumber}++;
! 189: $a=\$neww2k{$nfnumber};
! 190: $$a.=packword($fieldnumber, $id,$wordnumber);
! 191: if (length $$a>100) {flushw2k($nfnumber)}
! 192: }
! 193: else
! 194: {
! 195: $nfnumber=addnf(0, $f, $flag,1);
! 196: $newnf{uc $word}.=" $nfnumber";
! 197: $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
! 198: }
! 199: }
! 200: }
! 201:
! 202: }
! 203: }
! 204:
! 205: }
! 206:
! 207:
! 208: print "Filling word2question...\n";
! 209:
! 210: foreach (keys %neww2k)
! 211: {
! 212: updateword2question($_,$neww2k{$_});
! 213: delete $neww2k{$_};
! 214: }
! 215:
! 216: %neww2k=();
! 217:
! 218: print "Filling nf...\n";
! 219: $sch=0;
! 220:
! 221: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
! 222:
! 223: %nfkvo=();
! 224:
! 225: print "Filling nests...\n";
! 226: $sch=0;
! 227:
! 228:
! 229: foreach $w (keys %newnf)
! 230: {
! 231: print "$sch\n" unless (++$sch % 1000);
! 232: @nf=split ' ',$newnf{$w};
! 233: addnest($w,$_) foreach @nf;
! 234: }
! 235: print "$sch nests added\n";
! 236:
! 237: print TIME "$commonI: \t$sch ";
! 238: print TIME "\t".(time-$initime)."\n";
! 239: %newnf=();
! 240:
! 241: }
! 242:
! 243: sub flushw2k
! 244: {
! 245: my ($n)=@_;
! 246: updateword2question($n,$neww2k{$n});
! 247: delete $neww2k{$_};
! 248: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>