Annotation of db/prgsrc/updateRS1.pl, revision 1.10
1.2 boris 1: #!/usr/bin/perl -w
2:
3: =head1 NAME
4:
5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
6: B<$base> информации о вопросах
7:
8: =head1 SYNOPSIS
9:
10: updateRS1.pl Questions_per_cicle cicles_number
11:
12: updateRS.pl Questions_per_cicle
13:
14: updateRS.pl
15:
16:
17: =head1 DESCRIPTION
18:
19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
20: добавляет информацию в таблицы word2question, nests, nf. Поле
21: ProcessedBySearch устанавливается в 1. Обрабатывает
22: Questions_per_cicle*cicles_number вопросов, сбрасывая
23: информацию в базу каждые Questions_per_cicle вопросов.
24: Значения по умолчанию:
25: Questions_per_cicle=500;
26: cicles_number=1.
27:
28:
29: =head1 AUTHOR
30:
31: Роман Семизаров
32:
33:
34: =cut
35:
36:
37: use locale;
38: use DBI;
39: use POSIX qw (locale_h);
1.3 boris 40: use lib "../lib";
1.2 boris 41: use chgkfiles;
42: use dbchgk;
1.10 ! boris 43: my $DUMPDIR = $ENV{DUMPDIR} || "../dump";
! 44:
! 45: open (STDERR,">$DUMPDIR/errors");
1.2 boris 46: my $initime=time;
1.10 ! boris 47: open TIME, ">$DUMPDIR/time";
1.2 boris 48: do "common.pl";
49: do "chgk.cnf";
50:
51: $qlimit=shift||500;
52: $times=shift||1;
53:
54: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
55:
56: require "check.pl";
57:
58: open (STDERR,">$stderr") if $stderr;
59:
60:
61: open (UNKNOWN,">$unknown");
62:
63:
64:
65: my $nf;
66:
67: #open WARN, ">$warnings";
68:
1.5 roma7 69: #%forbidden=checktable('equalto')? getequalto : ();
1.2 boris 70:
71:
72: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
73:
74: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
75:
76:
77:
78: print "Loading dictionaries\n";
79:
80: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
81:
82: foreach $d(@dictionaries)
83: {
84: print "Loading $d\n";
85: open (DICT, $d) || print " Not found\n";
86: while ( <DICT> )
87: {
88: chomp;
89: s/\s*$//;
90: ($aa,$b)=split(/\//);
1.4 roma7 91: $aa=~tr/ёЁ/еЕ/;
1.2 boris 92: $a= uc $aa;
93: $words{$a}.=$b || "!";
94: }
95: close(DICT);
96: }
97:
98: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
99:
100:
101: print "Getting words...\n";
102:
103: print TIME "\t\t".(time-$initime)."\n";
1.9 roma7 104: my $broken=0;
1.2 boris 105:
106: for my $commonI(1..$times)
107: {
1.9 roma7 108: last if $broken;
1.2 boris 109:
110: print "loading nests\n";
111:
112: %nf=getnests;
113:
114: %nfnumber=getnfnumbers;
115:
116: print "Getting words...\n";
117:
118: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
119: $sch=0;
120: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
121: # берём по одному вопросу
122: # и вешаем в массив, индексы которого -- словоформы,
123: # а значения -- списки вопросов.
124: {
1.9 roma7 125: if (-e "RS_stop") {unlink("RS_stop"); $broken=1;last}
126: print "$sch $id\n" unless ($sch % 10);#unless (++$sch % 1);
1.2 boris 127: searchmark($id);
128: if ($forbidden{$id}) {next};
129: foreach $fieldnumber (0..$#ss) #перебираем поля
130: {
131: $text=$ss[$fieldnumber];
132: next unless $text;
133: $text=~tr/ёЁ/еЕ/;
134: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
135: $text=~s/p(${RLrl})/р$1/gom;
136: $text=~s/(${RLrl})p/$1р/gom;
137: $text=~s/\s+/ /gmo;
138: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
139:
140: foreach $wordnumber(0..$#list)
141: {
142: $word=uc $list[$wordnumber];
143: next if length $word>$maxwsize;
144: if (my $n=$nf{$word}||$newnf{$word})
145: {
146: @n= split ' ',$n;
147: $nfkvo{$_}++ foreach @n;
1.6 roma7 148: my $r=packword($fieldnumber, $id,$wordnumber%256);
149: $neww2k{$_}.=$r foreach (@n);
1.2 boris 150: }
151: else {
152: if ($word=~/^${RLrl}+$/o){ # Русское слово
153: # проанализировать по таблице аффиксов,
154: # проверить наличие начальных форм в
155: # nf, а если таких нет, то
156: # и по словарю.
157:
158: $nf=&checkit(uc $word,\%words);
159: if (!$nf) {
160: $nf=(uc $word)."/!";
161: print UNKNOWN "$nf\n" if $unknown;
162: }
163:
164: } else {# нерусское слово
165: $nf=(uc $word)."/!";
166: print UNKNOWN "$nf\n" if $unknown;
167: }
168:
169: foreach $n (split ' ', $nf)
170: {
171: ($f,$flag)=split '/', $n;
172: if ($nfnumber=$nfnumber{$f})
173: {
174: $newnf{$word}.=" $nfnumber";
175: $nfkvo{$nfnumber}++;
176: $a=\$neww2k{$nfnumber};
1.6 roma7 177: my $r=packword($fieldnumber, $id,$wordnumber);
178: $$a.=$r;
1.2 boris 179: if (length $$a>100) {flushw2k($nfnumber)}
180: }
181: else
182: {
183: $nfnumber=addnf(0, $f, $flag,1);
184: $newnf{uc $word}.=" $nfnumber";
1.6 roma7 185: my $r=packword($fieldnumber, $id,$wordnumber);
186: $neww2k{$nfnumber}.=$r;
1.2 boris 187: }
188: }
189: }
190:
191: }
192: }
193:
194: }
195:
196:
197: print "Filling word2question...\n";
198:
199: foreach (keys %neww2k)
200: {
201: updateword2question($_,$neww2k{$_});
202: delete $neww2k{$_};
203: }
204:
205: %neww2k=();
206:
207: print "Filling nf...\n";
208: $sch=0;
209:
210: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
211:
212: %nfkvo=();
213:
214: print "Filling nests...\n";
215: $sch=0;
216:
217:
218: foreach $w (keys %newnf)
219: {
220: print "$sch\n" unless (++$sch % 1000);
221: @nf=split ' ',$newnf{$w};
222: addnest($w,$_) foreach @nf;
223: }
224: print "$sch nests added\n";
225:
226: print TIME "$commonI: \t$sch ";
227: print TIME "\t".(time-$initime)."\n";
228: %newnf=();
229:
230: }
231:
232: sub flushw2k
233: {
234: my ($n)=@_;
235: updateword2question($n,$neww2k{$n});
236: delete $neww2k{$_};
237: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>