db/prgsrc/updateRS1.pl - diff

Return to updateRS1.pl CVS log

Up to [Local Repository] / db / prgsrc

Diff for /db/prgsrc/updateRS1.pl between versions 1.1 and 1.7

-version 1.1, 2001/10/31 03:00:10
+version 1.7, 2002/06/15 03:43:48
  Line 1
- #!/usr/local/bin/perl -w
+ #!/usr/bin/perl -w
  =head1 NAME
  updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
  B<$base> информации о вопросах
  =head1 SYNOPSIS
  updateRS1.pl Questions_per_cicle cicles_number
  updateRS.pl Questions_per_cicle
  updateRS.pl
  =head1 DESCRIPTION
  Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
  добавляет информацию в таблицы word2question, nests, nf. Поле
  ProcessedBySearch устанавливается в 1. Обрабатывает
  Questions_per_cicle*cicles_number вопросов, сбрасывая
  информацию в базу каждые Questions_per_cicle вопросов.
  Значения по умолчанию:
  Questions_per_cicle=500;
  cicles_number=1.
  =head1 AUTHOR
  Роман Семизаров
  =cut
  use locale;
  use DBI;
  use POSIX qw (locale_h);
- use chgkfiles;
+ use lib "../lib";
- use dbchgk;
+ use chgkfiles;
- open (STDERR,">errors");
+ use dbchgk;
- my $initime=time;
+ open (STDERR,">errors");
- open TIME, ">time";
+ my $initime=time;
- do "common.pl";
+ open TIME, ">time";
- do "chgk.cnf";
+ do "common.pl";
+ do "chgk.cnf";
- $qlimit=shift||500;
- $times=shift||1;
+ $qlimit=shift||500;
+ $times=shift||1;
- die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
+ die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
- require "check.pl";
+ require "check.pl";
- open (STDERR,">$stderr") if $stderr;
+ open (STDERR,">$stderr") if $stderr;
- open (UNKNOWN,">$unknown");
+ open (UNKNOWN,">$unknown");
- my $nf;
+ my $nf;
- #open WARN, ">$warnings";
+ #open WARN, ">$warnings";
- %forbidden=checktable('equalto')? getequalto : ();
+ #%forbidden=checktable('equalto')? getequalto : ();
- if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
+ if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
- getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
+ getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
- print "Loading dictionaries\n";
+ print "Loading dictionaries\n";
- die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
+ die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
- foreach $d(@dictionaries)
- {
+ foreach $d(@dictionaries)
-      print "Loading $d\n";
+ {
-      open (DICT, $d) || print "              Not found\n";
+      print "Loading $d\n";
-      while ( <DICT> )
+      open (DICT, $d) || print "              Not found\n";
-      {
+      while ( <DICT> )
-         chomp;
+      {
-         s/\s*$//;
+         chomp;
-         ($aa,$b)=split(/\//);
+         s/\s*$//;
-         $a= uc $aa;
+         ($aa,$b)=split(/\//);
-         $words{$a}.=$b || "!";
+         $aa=~tr/ёЁ/еЕ/;
-      }
+         $a= uc $aa;
-      close(DICT);
+         $words{$a}.=$b || "!";
- }
+      }
+      close(DICT);
- die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
+ }
+ die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
- print "Getting words...\n";
- print TIME "\t\t".(time-$initime)."\n";
+ print "Getting words...\n";
- for my $commonI(1..$times)
+ print TIME "\t\t".(time-$initime)."\n";
- {
+ for my $commonI(1..$times)
+ {
- print "loading nests\n";
- %nf=getnests;
+ print "loading nests\n";
- %nfnumber=getnfnumbers;
+ %nf=getnests;
- print "Getting words...\n";
+ %nfnumber=getnfnumbers;
- getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
+ print "Getting words...\n";
- $sch=0;
- while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
+ getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
-                              # берём по одному вопросу
+ $sch=0;
-                              # и вешаем в массив, индексы которого -- словоформы,
+ while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
-                              # а значения -- списки вопросов.
+                              # берём по одному вопросу
- {
+                              # и вешаем в массив, индексы которого -- словоформы,
+                              # а значения -- списки вопросов.
-    print "$sch $id\n" ;#unless (++$sch % 1);
+ {
-    searchmark($id);
-    if ($forbidden{$id}) {next};
+    print "$sch $id\n" ;#unless (++$sch % 1);
-    foreach $fieldnumber (0..$#ss) #перебираем поля
+    searchmark($id);
-    {
+    if ($forbidden{$id}) {next};
-       $text=$ss[$fieldnumber];
+    foreach $fieldnumber (0..$#ss) #перебираем поля
-       next unless $text;
+    {
-       $text=~tr/ёЁ/еЕ/;
+       $text=$ss[$fieldnumber];
-       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
+       next unless $text;
-       $text=~s/p(${RLrl})/р$1/gom;
+       $text=~tr/ёЁ/еЕ/;
-       $text=~s/(${RLrl})p/$1р/gom;
+       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
-       $text=~s/\s+/ /gmo;
+       $text=~s/p(${RLrl})/р$1/gom;
-       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
+       $text=~s/(${RLrl})p/$1р/gom;
+       $text=~s/\s+/ /gmo;
- foreach $wordnumber(0..$#list)
+       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
-       {
-            $word=uc $list[$wordnumber];
+ foreach $wordnumber(0..$#list)
-            next if length $word>$maxwsize;
+       {
-            if (my $n=$nf{$word}||$newnf{$word})
+            $word=uc $list[$wordnumber];
-            {
+            next if length $word>$maxwsize;
-                 @n= split ' ',$n;
+            if (my $n=$nf{$word}||$newnf{$word})
-                 $nfkvo{$_}++ foreach @n;
+            {
-                 $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
+                 @n= split ' ',$n;
-                      foreach (@n);
+                 $nfkvo{$_}++ foreach @n;
-            }
+                 my $r=packword($fieldnumber, $id,$wordnumber%256);
-            else {
+                 $neww2k{$_}.=$r foreach (@n);
-              if ($word=~/^${RLrl}+$/o){ # Русское слово
+            }
-             # проанализировать по таблице аффиксов,
+            else {
-             # проверить наличие начальных форм в
+              if ($word=~/^${RLrl}+$/o){ # Русское слово
-             # nf, а если таких нет, то
+             # проанализировать по таблице аффиксов,
-             # и по словарю.
+             # проверить наличие начальных форм в
+             # nf, а если таких нет, то
-                $nf=&checkit(uc $word,\%words);
+             # и по словарю.
-                if (!$nf) {
-                            $nf=(uc $word)."/!";
+                $nf=&checkit(uc $word,\%words);
-                            print UNKNOWN "$nf\n" if $unknown;
+                if (!$nf) {
-                          }
+                            $nf=(uc $word)."/!";
+                            print UNKNOWN "$nf\n" if $unknown;
-               }  else {# нерусское слово
+                          }
-                          $nf=(uc $word)."/!";
-                          print UNKNOWN "$nf\n" if $unknown;
+               }  else {# нерусское слово
-                       }
+                          $nf=(uc $word)."/!";
+                          print UNKNOWN "$nf\n" if $unknown;
-                foreach $n (split ' ', $nf)
+                       }
-                {
-                    ($f,$flag)=split '/', $n;
+                foreach $n (split ' ', $nf)
-                    if ($nfnumber=$nfnumber{$f})
+                {
-                    {
+                    ($f,$flag)=split '/', $n;
-                        $newnf{$word}.=" $nfnumber";
+                    if ($nfnumber=$nfnumber{$f})
-                        $nfkvo{$nfnumber}++;
+                    {
-                        $a=\$neww2k{$nfnumber};
+                        $newnf{$word}.=" $nfnumber";
-                        $$a.=packword($fieldnumber, $id,$wordnumber);
+                        $nfkvo{$nfnumber}++;
-                        if (length $$a>100) {flushw2k($nfnumber)}
+                        $a=\$neww2k{$nfnumber};
-                    }
+                        my $r=packword($fieldnumber, $id,$wordnumber);
-                    else
+ print "$word !!$r!\n";
-                    {
+                        $$a.=$r;
-                       $nfnumber=addnf(0, $f, $flag,1);
+                        if (length $$a>100) {flushw2k($nfnumber)}
-                       $newnf{uc $word}.=" $nfnumber";
+                    }
-                       $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
+                    else
-                    }
+                    {
-                }
+                       $nfnumber=addnf(0, $f, $flag,1);
-            }
+                       $newnf{uc $word}.=" $nfnumber";
+                        my $r=packword($fieldnumber, $id,$wordnumber);
-       }
+ print "$word !!!$r!\n";
-    }
+                       $neww2k{$nfnumber}.=$r;
+                    }
- }
+                }
+            }
- print "Filling word2question...\n";
+       }
+    }
- foreach (keys %neww2k)
- {
+ }
-    updateword2question($_,$neww2k{$_});
-    delete $neww2k{$_};
- }
+ print "Filling word2question...\n";
- %neww2k=();
+ foreach (keys %neww2k)
+ {
- print "Filling nf...\n";
+    updateword2question($_,$neww2k{$_});
- $sch=0;
+    delete $neww2k{$_};
+ }
- incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
+ %neww2k=();
- %nfkvo=();
+ print "Filling nf...\n";
- print "Filling nests...\n";
+ $sch=0;
- $sch=0;
+ incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
- foreach $w (keys %newnf)
+ %nfkvo=();
- {
-   print "$sch\n" unless (++$sch % 1000);
+ print "Filling nests...\n";
-   @nf=split ' ',$newnf{$w};
+ $sch=0;
-   addnest($w,$_) foreach @nf;
- }
- print "$sch nests added\n";
+ foreach $w (keys %newnf)
+ {
- print TIME "$commonI: \t$sch ";
+   print "$sch\n" unless (++$sch % 1000);
- print TIME "\t".(time-$initime)."\n";
+   @nf=split ' ',$newnf{$w};
- %newnf=();
+   addnest($w,$_) foreach @nf;
+ }
- }
+ print "$sch nests added\n";
- sub flushw2k
+ print TIME "$commonI: \t$sch ";
- {
+ print TIME "\t".(time-$initime)."\n";
-    my ($n)=@_;
+ %newnf=();
-    updateword2question($n,$neww2k{$n});
-    delete $neww2k{$_};
+ }
- }
+ sub flushw2k
+ {
+    my ($n)=@_;
+    updateword2question($n,$neww2k{$n});
+    delete $neww2k{$_};
+ }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.1
changed lines
	Added in v.1.7