1: #!/usr/local/bin/perl -w
2:
3: =head1 NAME
4:
5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
6: B<$base> информации о вопросах
7:
8: =head1 SYNOPSIS
9:
10: updateRS1.pl Questions_per_cicle cicles_number
11:
12: updateRS.pl Questions_per_cicle
13:
14: updateRS.pl
15:
16:
17: =head1 DESCRIPTION
18:
19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
20: добавляет информацию в таблицы word2question, nests, nf. Поле
21: ProcessedBySearch устанавливается в 1. Обрабатывает
22: Questions_per_cicle*cicles_number вопросов, сбрасывая
23: информацию в базу каждые Questions_per_cicle вопросов.
24: Значения по умолчанию:
25: Questions_per_cicle=500;
26: cicles_number=1.
27:
28:
29: =head1 AUTHOR
30:
31: Роман Семизаров
32:
33:
34: =cut
35:
36:
37: use locale;
38: use DBI;
39: use POSIX qw (locale_h);
40: use chgkfiles;
41: use dbchgk;
42: open (STDERR,">errors");
43: my $initime=time;
44: open TIME, ">time";
45: do "common.pl";
46: do "chgk.cnf";
47:
48: $qlimit=shift||500;
49: $times=shift||1;
50:
51: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
52:
53: require "check.pl";
54:
55: open (STDERR,">$stderr") if $stderr;
56:
57:
58: open (UNKNOWN,">$unknown");
59:
60:
61:
62: my $nf;
63:
64: #open WARN, ">$warnings";
65:
66: %forbidden=checktable('equalto')? getequalto : ();
67:
68:
69: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
70:
71: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
72:
73:
74:
75: print "Loading dictionaries\n";
76:
77: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
78:
79: foreach $d(@dictionaries)
80: {
81: print "Loading $d\n";
82: open (DICT, $d) || print " Not found\n";
83: while ( <DICT> )
84: {
85: chomp;
86: s/\s*$//;
87: ($aa,$b)=split(/\//);
88: $a= uc $aa;
89: $words{$a}.=$b || "!";
90: }
91: close(DICT);
92: }
93:
94: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
95:
96:
97: print "Getting words...\n";
98:
99: print TIME "\t\t".(time-$initime)."\n";
100:
101: for my $commonI(1..$times)
102: {
103:
104:
105: print "loading nests\n";
106:
107: %nf=getnests;
108:
109: %nfnumber=getnfnumbers;
110:
111: print "Getting words...\n";
112:
113: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
114: $sch=0;
115: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
116: # берём по одному вопросу
117: # и вешаем в массив, индексы которого -- словоформы,
118: # а значения -- списки вопросов.
119: {
120:
121: print "$sch $id\n" ;#unless (++$sch % 1);
122: searchmark($id);
123: if ($forbidden{$id}) {next};
124: foreach $fieldnumber (0..$#ss) #перебираем поля
125: {
126: $text=$ss[$fieldnumber];
127: next unless $text;
128: $text=~tr/ёЁ/еЕ/;
129: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
130: $text=~s/p(${RLrl})/р$1/gom;
131: $text=~s/(${RLrl})p/$1р/gom;
132: $text=~s/\s+/ /gmo;
133: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
134:
135: foreach $wordnumber(0..$#list)
136: {
137: $word=uc $list[$wordnumber];
138: next if length $word>$maxwsize;
139: if (my $n=$nf{$word}||$newnf{$word})
140: {
141: @n= split ' ',$n;
142: $nfkvo{$_}++ foreach @n;
143: $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
144: foreach (@n);
145: }
146: else {
147: if ($word=~/^${RLrl}+$/o){ # Русское слово
148: # проанализировать по таблице аффиксов,
149: # проверить наличие начальных форм в
150: # nf, а если таких нет, то
151: # и по словарю.
152:
153: $nf=&checkit(uc $word,\%words);
154: if (!$nf) {
155: $nf=(uc $word)."/!";
156: print UNKNOWN "$nf\n" if $unknown;
157: }
158:
159: } else {# нерусское слово
160: $nf=(uc $word)."/!";
161: print UNKNOWN "$nf\n" if $unknown;
162: }
163:
164: foreach $n (split ' ', $nf)
165: {
166: ($f,$flag)=split '/', $n;
167: if ($nfnumber=$nfnumber{$f})
168: {
169: $newnf{$word}.=" $nfnumber";
170: $nfkvo{$nfnumber}++;
171: $a=\$neww2k{$nfnumber};
172: $$a.=packword($fieldnumber, $id,$wordnumber);
173: if (length $$a>100) {flushw2k($nfnumber)}
174: }
175: else
176: {
177: $nfnumber=addnf(0, $f, $flag,1);
178: $newnf{uc $word}.=" $nfnumber";
179: $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
180: }
181: }
182: }
183:
184: }
185: }
186:
187: }
188:
189:
190: print "Filling word2question...\n";
191:
192: foreach (keys %neww2k)
193: {
194: updateword2question($_,$neww2k{$_});
195: delete $neww2k{$_};
196: }
197:
198: %neww2k=();
199:
200: print "Filling nf...\n";
201: $sch=0;
202:
203: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
204:
205: %nfkvo=();
206:
207: print "Filling nests...\n";
208: $sch=0;
209:
210:
211: foreach $w (keys %newnf)
212: {
213: print "$sch\n" unless (++$sch % 1000);
214: @nf=split ' ',$newnf{$w};
215: addnest($w,$_) foreach @nf;
216: }
217: print "$sch nests added\n";
218:
219: print TIME "$commonI: \t$sch ";
220: print TIME "\t".(time-$initime)."\n";
221: %newnf=();
222:
223: }
224:
225: sub flushw2k
226: {
227: my ($n)=@_;
228: updateword2question($n,$neww2k{$n});
229: delete $neww2k{$_};
230: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>