1: #!/usr/bin/perl -w
2:
3: =head1 NAME
4:
5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
6: B<$base> информации о вопросах. Использует Tie::Hash
7:
8: =head1 SYNOPSIS
9:
10: updateRS1.pl Questions_per_cicle cicles_number
11:
12: updateRS.pl Questions_per_cicle
13:
14: updateRS.pl
15:
16:
17: =head1 DESCRIPTION
18:
19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
20: добавляет информацию в таблицы word2question, nests, nf. Поле
21: ProcessedBySearch устанавливается в 1. Обрабатывает
22: Questions_per_cicle*cicles_number вопросов, сбрасывая
23: информацию в базу каждые Questions_per_cicle вопросов.
24: Значения по умолчанию:
25: Questions_per_cicle=500;
26: cicles_number=1.
27:
28:
29: =head1 AUTHOR
30:
31: Роман Семизаров
32:
33:
34: =cut
35:
36:
37:
38: use locale;
39: use DBI;
40: use POSIX qw (locale_h);
41: use lib "../lib";
42: use chgkfiles;
43: use Tie::Hash;
44: use dbchgk;
45: open (STDERR,">errors");
46: my $initime=time;
47: open TIME, ">time";
48: do "common.pl";
49: do "chgk.cnf";
50:
51:
52:
53: package NewStdHash;
54: require Tie::Hash;
55:
56: @ISA = (Tie::StdHash);
57:
58:
59: package main;
60:
61:
62:
63: $qlimit=shift||500;
64: $times=shift||1;
65:
66: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
67:
68: require "check.pl";
69:
70: open (STDERR,">$stderr") if $stderr;
71:
72:
73: open (UNKNOWN,">$unknown");
74:
75:
76:
77: my $nf;
78:
79: #open WARN, ">$warnings";
80:
81: #%forbidden=checktable('equalto')? getequalto : ();
82:
83:
84: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
85:
86: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
87:
88:
89:
90: print "Loading dictionaries\n";
91:
92: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
93:
94: foreach $d(@dictionaries)
95: {
96: print "Loading $d\n";
97: open (DICT, $d) || print " Not found\n";
98: while ( <DICT> )
99: {
100: chomp;
101: s/\s*$//;
102: ($aa,$b)=split(/\//);
103: $aa=~tr/ёЁ/еЕ/;
104: $a= uc $aa;
105: $words{$a}.=$b || "!";
106: }
107: close(DICT);
108: }
109:
110: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
111:
112:
113: print "Getting words...\n";
114:
115: print TIME "\t\t".(time-$initime)."\n";
116:
117: for my $commonI(1..$times)
118: {
119:
120:
121: print "loading nests\n";
122:
123: tie %nf, 'NewStdHash';
124:
125: %nf=getnests;
126:
127: %nfnumber=getnfnumbers;
128:
129: print "Getting words...\n";
130:
131: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
132: $sch=0;
133: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
134: # берём по одному вопросу
135: # и вешаем в массив, индексы которого -- словоформы,
136: # а значения -- списки вопросов.
137: {
138:
139: print "$sch $id\n" ;#unless (++$sch % 1);
140: searchmark($id);
141: if ($forbidden{$id}) {next};
142: foreach $fieldnumber (0..$#ss) #перебираем поля
143: {
144: $text=$ss[$fieldnumber];
145: next unless $text;
146: $text=~tr/ёЁ/еЕ/;
147: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
148: $text=~s/p(${RLrl})/р$1/gom;
149: $text=~s/(${RLrl})p/$1р/gom;
150: $text=~s/\s+/ /gmo;
151: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
152:
153: foreach $wordnumber(0..$#list)
154: {
155: $word=uc $list[$wordnumber];
156: next if length $word>$maxwsize;
157: if (my $n=$nf{$word}||$newnf{$word})
158: {
159: @n= split ' ',$n;
160: $nfkvo{$_}++ foreach @n;
161: $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
162: foreach (@n);
163: }
164: else {
165: if ($word=~/^${RLrl}+$/o){ # Русское слово
166: # проанализировать по таблице аффиксов,
167: # проверить наличие начальных форм в
168: # nf, а если таких нет, то
169: # и по словарю.
170:
171: $nf=&checkit(uc $word,\%words);
172: if (!$nf) {
173: $nf=(uc $word)."/!";
174: print UNKNOWN "$nf\n" if $unknown;
175: }
176:
177: } else {# нерусское слово
178: $nf=(uc $word)."/!";
179: print UNKNOWN "$nf\n" if $unknown;
180: }
181:
182: foreach $n (split ' ', $nf)
183: {
184: ($f,$flag)=split '/', $n;
185: if ($nfnumber=$nfnumber{$f})
186: {
187: $newnf{$word}.=" $nfnumber";
188: $nfkvo{$nfnumber}++;
189: $a=\$neww2k{$nfnumber};
190: $$a.=packword($fieldnumber, $id,$wordnumber);
191: if (length $$a>100) {flushw2k($nfnumber)}
192: }
193: else
194: {
195: $nfnumber=addnf(0, $f, $flag,1);
196: $newnf{uc $word}.=" $nfnumber";
197: $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
198: }
199: }
200: }
201:
202: }
203: }
204:
205: }
206:
207:
208: print "Filling word2question...\n";
209:
210: foreach (keys %neww2k)
211: {
212: updateword2question($_,$neww2k{$_});
213: delete $neww2k{$_};
214: }
215:
216: %neww2k=();
217:
218: print "Filling nf...\n";
219: $sch=0;
220:
221: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
222:
223: %nfkvo=();
224:
225: print "Filling nests...\n";
226: $sch=0;
227:
228:
229: foreach $w (keys %newnf)
230: {
231: print "$sch\n" unless (++$sch % 1000);
232: @nf=split ' ',$newnf{$w};
233: addnest($w,$_) foreach @nf;
234: }
235: print "$sch nests added\n";
236:
237: print TIME "$commonI: \t$sch ";
238: print TIME "\t".(time-$initime)."\n";
239: %newnf=();
240:
241: }
242:
243: sub flushw2k
244: {
245: my ($n)=@_;
246: updateword2question($n,$neww2k{$n});
247: delete $neww2k{$_};
248: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>