Annotation of db/prgsrc/updateRS2.pl, revision 1.2
1.1 roma7 1: #!/usr/bin/perl -w
2:
3: =head1 NAME
4:
5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
6: B<$base> информации о вопросах. Использует Tie::Hash
7:
8: =head1 SYNOPSIS
9:
10: updateRS1.pl Questions_per_cicle cicles_number
11:
12: updateRS.pl Questions_per_cicle
13:
14: updateRS.pl
15:
16:
17: =head1 DESCRIPTION
18:
19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
20: добавляет информацию в таблицы word2question, nests, nf. Поле
21: ProcessedBySearch устанавливается в 1. Обрабатывает
22: Questions_per_cicle*cicles_number вопросов, сбрасывая
23: информацию в базу каждые Questions_per_cicle вопросов.
24: Значения по умолчанию:
25: Questions_per_cicle=500;
26: cicles_number=1.
27:
28:
29: =head1 AUTHOR
30:
31: Роман Семизаров
32:
33:
34: =cut
35:
36:
37:
38: use locale;
39: use DBI;
40: use POSIX qw (locale_h);
41: use lib "../lib";
42: use chgkfiles;
43: use Tie::Hash;
44: use dbchgk;
1.2 ! boris 45:
! 46: my $DUMPDIR = $ENV{DUMPDIR} || "../dump";
! 47: open (STDERR,">$DUMPDIR/errors");
1.1 roma7 48: my $initime=time;
1.2 ! boris 49: open TIME, ">$DUMPDIR/time";
1.1 roma7 50: do "common.pl";
51: do "chgk.cnf";
52:
53:
54:
55: package NewStdHash;
56: require Tie::Hash;
57:
58: @ISA = (Tie::StdHash);
59:
60:
61: package main;
62:
63:
64:
65: $qlimit=shift||500;
66: $times=shift||1;
67:
68: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
69:
70: require "check.pl";
71:
72: open (STDERR,">$stderr") if $stderr;
73:
74:
75: open (UNKNOWN,">$unknown");
76:
77:
78:
79: my $nf;
80:
81: #open WARN, ">$warnings";
82:
83: #%forbidden=checktable('equalto')? getequalto : ();
84:
85:
86: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
87:
88: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
89:
90:
91:
92: print "Loading dictionaries\n";
93:
94: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
95:
96: foreach $d(@dictionaries)
97: {
98: print "Loading $d\n";
99: open (DICT, $d) || print " Not found\n";
100: while ( <DICT> )
101: {
102: chomp;
103: s/\s*$//;
104: ($aa,$b)=split(/\//);
105: $aa=~tr/ёЁ/еЕ/;
106: $a= uc $aa;
107: $words{$a}.=$b || "!";
108: }
109: close(DICT);
110: }
111:
112: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
113:
114:
115: print "Getting words...\n";
116:
117: print TIME "\t\t".(time-$initime)."\n";
118:
119: for my $commonI(1..$times)
120: {
121:
122:
123: print "loading nests\n";
124:
125: tie %nf, 'NewStdHash';
126:
127: %nf=getnests;
128:
129: %nfnumber=getnfnumbers;
130:
131: print "Getting words...\n";
132:
133: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
134: $sch=0;
135: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
136: # берём по одному вопросу
137: # и вешаем в массив, индексы которого -- словоформы,
138: # а значения -- списки вопросов.
139: {
140:
141: print "$sch $id\n" ;#unless (++$sch % 1);
142: searchmark($id);
143: if ($forbidden{$id}) {next};
144: foreach $fieldnumber (0..$#ss) #перебираем поля
145: {
146: $text=$ss[$fieldnumber];
147: next unless $text;
148: $text=~tr/ёЁ/еЕ/;
149: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
150: $text=~s/p(${RLrl})/р$1/gom;
151: $text=~s/(${RLrl})p/$1р/gom;
152: $text=~s/\s+/ /gmo;
153: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
154:
155: foreach $wordnumber(0..$#list)
156: {
157: $word=uc $list[$wordnumber];
158: next if length $word>$maxwsize;
159: if (my $n=$nf{$word}||$newnf{$word})
160: {
161: @n= split ' ',$n;
162: $nfkvo{$_}++ foreach @n;
163: $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
164: foreach (@n);
165: }
166: else {
167: if ($word=~/^${RLrl}+$/o){ # Русское слово
168: # проанализировать по таблице аффиксов,
169: # проверить наличие начальных форм в
170: # nf, а если таких нет, то
171: # и по словарю.
172:
173: $nf=&checkit(uc $word,\%words);
174: if (!$nf) {
175: $nf=(uc $word)."/!";
176: print UNKNOWN "$nf\n" if $unknown;
177: }
178:
179: } else {# нерусское слово
180: $nf=(uc $word)."/!";
181: print UNKNOWN "$nf\n" if $unknown;
182: }
183:
184: foreach $n (split ' ', $nf)
185: {
186: ($f,$flag)=split '/', $n;
187: if ($nfnumber=$nfnumber{$f})
188: {
189: $newnf{$word}.=" $nfnumber";
190: $nfkvo{$nfnumber}++;
191: $a=\$neww2k{$nfnumber};
192: $$a.=packword($fieldnumber, $id,$wordnumber);
193: if (length $$a>100) {flushw2k($nfnumber)}
194: }
195: else
196: {
197: $nfnumber=addnf(0, $f, $flag,1);
198: $newnf{uc $word}.=" $nfnumber";
199: $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
200: }
201: }
202: }
203:
204: }
205: }
206:
207: }
208:
209:
210: print "Filling word2question...\n";
211:
212: foreach (keys %neww2k)
213: {
214: updateword2question($_,$neww2k{$_});
215: delete $neww2k{$_};
216: }
217:
218: %neww2k=();
219:
220: print "Filling nf...\n";
221: $sch=0;
222:
223: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
224:
225: %nfkvo=();
226:
227: print "Filling nests...\n";
228: $sch=0;
229:
230:
231: foreach $w (keys %newnf)
232: {
233: print "$sch\n" unless (++$sch % 1000);
234: @nf=split ' ',$newnf{$w};
235: addnest($w,$_) foreach @nf;
236: }
237: print "$sch nests added\n";
238:
239: print TIME "$commonI: \t$sch ";
240: print TIME "\t".(time-$initime)."\n";
241: %newnf=();
242:
243: }
244:
245: sub flushw2k
246: {
247: my ($n)=@_;
248: updateword2question($n,$neww2k{$n});
249: delete $neww2k{$_};
250: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>