Annotation of db/prgsrc/updateRS1.pl, revision 1.1
1.1 ! boris 1: #!/usr/local/bin/perl -w
! 2:
! 3: =head1 NAME
! 4:
! 5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
! 6: B<$base> информации о вопросах
! 7:
! 8: =head1 SYNOPSIS
! 9:
! 10: updateRS1.pl Questions_per_cicle cicles_number
! 11:
! 12: updateRS.pl Questions_per_cicle
! 13:
! 14: updateRS.pl
! 15:
! 16:
! 17: =head1 DESCRIPTION
! 18:
! 19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
! 20: добавляет информацию в таблицы word2question, nests, nf. Поле
! 21: ProcessedBySearch устанавливается в 1. Обрабатывает
! 22: Questions_per_cicle*cicles_number вопросов, сбрасывая
! 23: информацию в базу каждые Questions_per_cicle вопросов.
! 24: Значения по умолчанию:
! 25: Questions_per_cicle=500;
! 26: cicles_number=1.
! 27:
! 28:
! 29: =head1 AUTHOR
! 30:
! 31: Роман Семизаров
! 32:
! 33:
! 34: =cut
! 35:
! 36:
! 37: use locale;
! 38: use DBI;
! 39: use POSIX qw (locale_h);
! 40: use chgkfiles;
! 41: use dbchgk;
! 42: open (STDERR,">errors");
! 43: my $initime=time;
! 44: open TIME, ">time";
! 45: do "common.pl";
! 46: do "chgk.cnf";
! 47:
! 48: $qlimit=shift||500;
! 49: $times=shift||1;
! 50:
! 51: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
! 52:
! 53: require "check.pl";
! 54:
! 55: open (STDERR,">$stderr") if $stderr;
! 56:
! 57:
! 58: open (UNKNOWN,">$unknown");
! 59:
! 60:
! 61:
! 62: my $nf;
! 63:
! 64: #open WARN, ">$warnings";
! 65:
! 66: %forbidden=checktable('equalto')? getequalto : ();
! 67:
! 68:
! 69: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
! 70:
! 71: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
! 72:
! 73:
! 74:
! 75: print "Loading dictionaries\n";
! 76:
! 77: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
! 78:
! 79: foreach $d(@dictionaries)
! 80: {
! 81: print "Loading $d\n";
! 82: open (DICT, $d) || print " Not found\n";
! 83: while ( <DICT> )
! 84: {
! 85: chomp;
! 86: s/\s*$//;
! 87: ($aa,$b)=split(/\//);
! 88: $a= uc $aa;
! 89: $words{$a}.=$b || "!";
! 90: }
! 91: close(DICT);
! 92: }
! 93:
! 94: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
! 95:
! 96:
! 97: print "Getting words...\n";
! 98:
! 99: print TIME "\t\t".(time-$initime)."\n";
! 100:
! 101: for my $commonI(1..$times)
! 102: {
! 103:
! 104:
! 105: print "loading nests\n";
! 106:
! 107: %nf=getnests;
! 108:
! 109: %nfnumber=getnfnumbers;
! 110:
! 111: print "Getting words...\n";
! 112:
! 113: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
! 114: $sch=0;
! 115: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
! 116: # берём по одному вопросу
! 117: # и вешаем в массив, индексы которого -- словоформы,
! 118: # а значения -- списки вопросов.
! 119: {
! 120:
! 121: print "$sch $id\n" ;#unless (++$sch % 1);
! 122: searchmark($id);
! 123: if ($forbidden{$id}) {next};
! 124: foreach $fieldnumber (0..$#ss) #перебираем поля
! 125: {
! 126: $text=$ss[$fieldnumber];
! 127: next unless $text;
! 128: $text=~tr/ёЁ/еЕ/;
! 129: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
! 130: $text=~s/p(${RLrl})/р$1/gom;
! 131: $text=~s/(${RLrl})p/$1р/gom;
! 132: $text=~s/\s+/ /gmo;
! 133: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
! 134:
! 135: foreach $wordnumber(0..$#list)
! 136: {
! 137: $word=uc $list[$wordnumber];
! 138: next if length $word>$maxwsize;
! 139: if (my $n=$nf{$word}||$newnf{$word})
! 140: {
! 141: @n= split ' ',$n;
! 142: $nfkvo{$_}++ foreach @n;
! 143: $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
! 144: foreach (@n);
! 145: }
! 146: else {
! 147: if ($word=~/^${RLrl}+$/o){ # Русское слово
! 148: # проанализировать по таблице аффиксов,
! 149: # проверить наличие начальных форм в
! 150: # nf, а если таких нет, то
! 151: # и по словарю.
! 152:
! 153: $nf=&checkit(uc $word,\%words);
! 154: if (!$nf) {
! 155: $nf=(uc $word)."/!";
! 156: print UNKNOWN "$nf\n" if $unknown;
! 157: }
! 158:
! 159: } else {# нерусское слово
! 160: $nf=(uc $word)."/!";
! 161: print UNKNOWN "$nf\n" if $unknown;
! 162: }
! 163:
! 164: foreach $n (split ' ', $nf)
! 165: {
! 166: ($f,$flag)=split '/', $n;
! 167: if ($nfnumber=$nfnumber{$f})
! 168: {
! 169: $newnf{$word}.=" $nfnumber";
! 170: $nfkvo{$nfnumber}++;
! 171: $a=\$neww2k{$nfnumber};
! 172: $$a.=packword($fieldnumber, $id,$wordnumber);
! 173: if (length $$a>100) {flushw2k($nfnumber)}
! 174: }
! 175: else
! 176: {
! 177: $nfnumber=addnf(0, $f, $flag,1);
! 178: $newnf{uc $word}.=" $nfnumber";
! 179: $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
! 180: }
! 181: }
! 182: }
! 183:
! 184: }
! 185: }
! 186:
! 187: }
! 188:
! 189:
! 190: print "Filling word2question...\n";
! 191:
! 192: foreach (keys %neww2k)
! 193: {
! 194: updateword2question($_,$neww2k{$_});
! 195: delete $neww2k{$_};
! 196: }
! 197:
! 198: %neww2k=();
! 199:
! 200: print "Filling nf...\n";
! 201: $sch=0;
! 202:
! 203: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
! 204:
! 205: %nfkvo=();
! 206:
! 207: print "Filling nests...\n";
! 208: $sch=0;
! 209:
! 210:
! 211: foreach $w (keys %newnf)
! 212: {
! 213: print "$sch\n" unless (++$sch % 1000);
! 214: @nf=split ' ',$newnf{$w};
! 215: addnest($w,$_) foreach @nf;
! 216: }
! 217: print "$sch nests added\n";
! 218:
! 219: print TIME "$commonI: \t$sch ";
! 220: print TIME "\t".(time-$initime)."\n";
! 221: %newnf=();
! 222:
! 223: }
! 224:
! 225: sub flushw2k
! 226: {
! 227: my ($n)=@_;
! 228: updateword2question($n,$neww2k{$n});
! 229: delete $neww2k{$_};
! 230: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>