version 1.1, 2001/10/31 03:00:10
|
version 1.7, 2002/06/15 03:43:48
|
Line 1
|
Line 1
|
#!/usr/local/bin/perl -w
|
#!/usr/bin/perl -w |
|
|
=head1 NAME
|
=head1 NAME |
|
|
updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
|
updateRS.pl - Скрипт для занесения в таблицы русского поиска базы |
B<$base> информации о вопросах
|
B<$base> информации о вопросах |
|
|
=head1 SYNOPSIS
|
=head1 SYNOPSIS |
|
|
updateRS1.pl Questions_per_cicle cicles_number
|
updateRS1.pl Questions_per_cicle cicles_number |
|
|
updateRS.pl Questions_per_cicle
|
updateRS.pl Questions_per_cicle |
|
|
updateRS.pl
|
updateRS.pl |
|
|
|
|
=head1 DESCRIPTION
|
=head1 DESCRIPTION |
|
|
Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
|
Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch, |
добавляет информацию в таблицы word2question, nests, nf. Поле
|
добавляет информацию в таблицы word2question, nests, nf. Поле |
ProcessedBySearch устанавливается в 1. Обрабатывает
|
ProcessedBySearch устанавливается в 1. Обрабатывает |
Questions_per_cicle*cicles_number вопросов, сбрасывая
|
Questions_per_cicle*cicles_number вопросов, сбрасывая |
информацию в базу каждые Questions_per_cicle вопросов.
|
информацию в базу каждые Questions_per_cicle вопросов. |
Значения по умолчанию:
|
Значения по умолчанию: |
Questions_per_cicle=500;
|
Questions_per_cicle=500; |
cicles_number=1.
|
cicles_number=1. |
|
|
|
|
=head1 AUTHOR
|
=head1 AUTHOR |
|
|
Роман Семизаров
|
Роман Семизаров |
|
|
|
|
=cut
|
=cut |
|
|
|
|
use locale;
|
use locale; |
use DBI;
|
use DBI; |
use POSIX qw (locale_h);
|
use POSIX qw (locale_h); |
use chgkfiles;
|
use lib "../lib"; |
use dbchgk;
|
use chgkfiles; |
open (STDERR,">errors");
|
use dbchgk; |
my $initime=time;
|
open (STDERR,">errors"); |
open TIME, ">time";
|
my $initime=time; |
do "common.pl";
|
open TIME, ">time"; |
do "chgk.cnf";
|
do "common.pl"; |
|
do "chgk.cnf"; |
$qlimit=shift||500;
|
|
$times=shift||1;
|
$qlimit=shift||500; |
|
$times=shift||1; |
die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
|
|
|
die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize; |
require "check.pl";
|
|
|
require "check.pl"; |
open (STDERR,">$stderr") if $stderr;
|
|
|
open (STDERR,">$stderr") if $stderr; |
|
|
open (UNKNOWN,">$unknown");
|
|
|
open (UNKNOWN,">$unknown"); |
|
|
|
|
my $nf;
|
|
|
my $nf; |
#open WARN, ">$warnings";
|
|
|
#open WARN, ">$warnings"; |
%forbidden=checktable('equalto')? getequalto : ();
|
|
|
#%forbidden=checktable('equalto')? getequalto : (); |
|
|
if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
|
|
|
if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"}; |
getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
|
|
|
getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL"); |
|
|
|
|
print "Loading dictionaries\n";
|
|
|
print "Loading dictionaries\n"; |
die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
|
|
|
die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries; |
foreach $d(@dictionaries)
|
|
{
|
foreach $d(@dictionaries) |
print "Loading $d\n";
|
{ |
open (DICT, $d) || print " Not found\n";
|
print "Loading $d\n"; |
while ( <DICT> )
|
open (DICT, $d) || print " Not found\n"; |
{
|
while ( <DICT> ) |
chomp;
|
{ |
s/\s*$//;
|
chomp; |
($aa,$b)=split(/\//);
|
s/\s*$//; |
$a= uc $aa;
|
($aa,$b)=split(/\//); |
$words{$a}.=$b || "!";
|
$aa=~tr/ёЁ/еЕ/; |
}
|
$a= uc $aa; |
close(DICT);
|
$words{$a}.=$b || "!"; |
}
|
} |
|
close(DICT); |
die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
|
} |
|
|
|
die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words; |
print "Getting words...\n";
|
|
|
|
print TIME "\t\t".(time-$initime)."\n";
|
print "Getting words...\n"; |
|
|
for my $commonI(1..$times)
|
print TIME "\t\t".(time-$initime)."\n"; |
{
|
|
|
for my $commonI(1..$times) |
|
{ |
print "loading nests\n";
|
|
|
|
%nf=getnests;
|
print "loading nests\n"; |
|
|
%nfnumber=getnfnumbers;
|
%nf=getnests; |
|
|
print "Getting words...\n";
|
%nfnumber=getnfnumbers; |
|
|
getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
|
print "Getting words...\n"; |
$sch=0;
|
|
while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
|
getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL"); |
# берём по одному вопросу
|
$sch=0; |
# и вешаем в массив, индексы которого -- словоформы,
|
while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id)) |
# а значения -- списки вопросов.
|
# берём по одному вопросу |
{
|
# и вешаем в массив, индексы которого -- словоформы, |
|
# а значения -- списки вопросов. |
print "$sch $id\n" ;#unless (++$sch % 1);
|
{ |
searchmark($id);
|
|
if ($forbidden{$id}) {next};
|
print "$sch $id\n" ;#unless (++$sch % 1); |
foreach $fieldnumber (0..$#ss) #перебираем поля
|
searchmark($id); |
{
|
if ($forbidden{$id}) {next}; |
$text=$ss[$fieldnumber];
|
foreach $fieldnumber (0..$#ss) #перебираем поля |
next unless $text;
|
{ |
$text=~tr/ёЁ/еЕ/;
|
$text=$ss[$fieldnumber]; |
$text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
|
next unless $text; |
$text=~s/p(${RLrl})/р$1/gom;
|
$text=~tr/ёЁ/еЕ/; |
$text=~s/(${RLrl})p/$1р/gom;
|
$text=~s/(${RLrl})p(${RLrl})/$1p$2/gom; |
$text=~s/\s+/ /gmo;
|
$text=~s/p(${RLrl})/р$1/gom; |
@list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
|
$text=~s/(${RLrl})p/$1р/gom; |
|
$text=~s/\s+/ /gmo; |
foreach $wordnumber(0..$#list)
|
@list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom; |
{
|
|
$word=uc $list[$wordnumber];
|
foreach $wordnumber(0..$#list) |
next if length $word>$maxwsize;
|
{ |
if (my $n=$nf{$word}||$newnf{$word})
|
$word=uc $list[$wordnumber]; |
{
|
next if length $word>$maxwsize; |
@n= split ' ',$n;
|
if (my $n=$nf{$word}||$newnf{$word}) |
$nfkvo{$_}++ foreach @n;
|
{ |
$neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
|
@n= split ' ',$n; |
foreach (@n);
|
$nfkvo{$_}++ foreach @n; |
}
|
my $r=packword($fieldnumber, $id,$wordnumber%256); |
else {
|
$neww2k{$_}.=$r foreach (@n); |
if ($word=~/^${RLrl}+$/o){ # Русское слово
|
} |
# проанализировать по таблице аффиксов,
|
else { |
# проверить наличие начальных форм в
|
if ($word=~/^${RLrl}+$/o){ # Русское слово |
# nf, а если таких нет, то
|
# проанализировать по таблице аффиксов, |
# и по словарю.
|
# проверить наличие начальных форм в |
|
# nf, а если таких нет, то |
$nf=&checkit(uc $word,\%words);
|
# и по словарю. |
if (!$nf) {
|
|
$nf=(uc $word)."/!";
|
$nf=&checkit(uc $word,\%words); |
print UNKNOWN "$nf\n" if $unknown;
|
if (!$nf) { |
}
|
$nf=(uc $word)."/!"; |
|
print UNKNOWN "$nf\n" if $unknown; |
} else {# нерусское слово
|
} |
$nf=(uc $word)."/!";
|
|
print UNKNOWN "$nf\n" if $unknown;
|
} else {# нерусское слово |
}
|
$nf=(uc $word)."/!"; |
|
print UNKNOWN "$nf\n" if $unknown; |
foreach $n (split ' ', $nf)
|
} |
{
|
|
($f,$flag)=split '/', $n;
|
foreach $n (split ' ', $nf) |
if ($nfnumber=$nfnumber{$f})
|
{ |
{
|
($f,$flag)=split '/', $n; |
$newnf{$word}.=" $nfnumber";
|
if ($nfnumber=$nfnumber{$f}) |
$nfkvo{$nfnumber}++;
|
{ |
$a=\$neww2k{$nfnumber};
|
$newnf{$word}.=" $nfnumber"; |
$$a.=packword($fieldnumber, $id,$wordnumber);
|
$nfkvo{$nfnumber}++; |
if (length $$a>100) {flushw2k($nfnumber)}
|
$a=\$neww2k{$nfnumber}; |
}
|
my $r=packword($fieldnumber, $id,$wordnumber); |
else
|
print "$word !!$r!\n"; |
{
|
$$a.=$r; |
$nfnumber=addnf(0, $f, $flag,1);
|
if (length $$a>100) {flushw2k($nfnumber)} |
$newnf{uc $word}.=" $nfnumber";
|
} |
$neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
|
else |
}
|
{ |
}
|
$nfnumber=addnf(0, $f, $flag,1); |
}
|
$newnf{uc $word}.=" $nfnumber"; |
|
my $r=packword($fieldnumber, $id,$wordnumber); |
}
|
print "$word !!!$r!\n"; |
}
|
$neww2k{$nfnumber}.=$r; |
|
} |
}
|
} |
|
} |
|
|
print "Filling word2question...\n";
|
} |
|
} |
foreach (keys %neww2k)
|
|
{
|
} |
updateword2question($_,$neww2k{$_});
|
|
delete $neww2k{$_};
|
|
}
|
print "Filling word2question...\n"; |
|
|
%neww2k=();
|
foreach (keys %neww2k) |
|
{ |
print "Filling nf...\n";
|
updateword2question($_,$neww2k{$_}); |
$sch=0;
|
delete $neww2k{$_}; |
|
} |
incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
|
|
|
%neww2k=(); |
%nfkvo=();
|
|
|
print "Filling nf...\n"; |
print "Filling nests...\n";
|
$sch=0; |
$sch=0;
|
|
|
incnf($_,$nfkvo{$_}) foreach (keys %nfkvo); |
|
|
foreach $w (keys %newnf)
|
%nfkvo=(); |
{
|
|
print "$sch\n" unless (++$sch % 1000);
|
print "Filling nests...\n"; |
@nf=split ' ',$newnf{$w};
|
$sch=0; |
addnest($w,$_) foreach @nf;
|
|
}
|
|
print "$sch nests added\n";
|
foreach $w (keys %newnf) |
|
{ |
print TIME "$commonI: \t$sch ";
|
print "$sch\n" unless (++$sch % 1000); |
print TIME "\t".(time-$initime)."\n";
|
@nf=split ' ',$newnf{$w}; |
%newnf=();
|
addnest($w,$_) foreach @nf; |
|
} |
}
|
print "$sch nests added\n"; |
|
|
sub flushw2k
|
print TIME "$commonI: \t$sch "; |
{
|
print TIME "\t".(time-$initime)."\n"; |
my ($n)=@_;
|
%newnf=(); |
updateword2question($n,$neww2k{$n});
|
|
delete $neww2k{$_};
|
} |
}
|
|
|
sub flushw2k |
|
{ |
|
my ($n)=@_; |
|
updateword2question($n,$neww2k{$n}); |
|
delete $neww2k{$_}; |
|
} |