| 1 |
# Revision history; printed when the script is run with --version.
$revisions = <<"END";
Revisions

v0.17 - 08 Mar 07
- Support of categories

v0.16 - 29 Feb 07
- Using localized image namespace (e.g. Bild: for de)

v0.15 - 25 Feb 07
- Changed deaccenting method to make it locale-independant
- FIX locale-independant sort command

v0.10 - 20 Feb 07
- Packing of image and math directories in at most 256 files for more convenient copy to device

v0.9 - 17 Feb 07
- Added support for math notations (requires LaTeX)
- Added some sanity checks on external dependencies
- Added ability to take the archive as argument instead of the language. directories are created and archive is moved over as necessary
- Better error management
- Experimental support for automatic dump download (requires curl)
- Better command line syntax to activate dump download, image download and math support

END

# External programs run through the shell.
$sortcmd  = 'sort';
$curlcmd  = 'curl';
$latexcmd = 'latex';
$dvipscmd = 'dvipng';    # note: the variable says "dvips" but the tool is dvipng

# How the title index gets sorted:
#   1 = external sort command
#   2 = titles kept in memory and sorted with merge_sort
#   3 = index file re-read and inserted one by one with InsertSorted
#       (selected automatically when the external sort is unusable)
$sortmethod = 1;

# Image thumbnails are only fetched when requested on the command line.
$imgsupport = 0;

# Media URL templates; %s is a placeholder (presumably the language code --
# confirm against ProcessImage).
$thumbmediaprefix = 'http://upload.wikimedia.org/wikipedia/%s/thumb';
$fullmediaprefix  = 'http://upload.wikimedia.org/wikipedia/%s';

# Width used for an image whose markup carries no explicit "NNNpx" size.
$imgsize = 120;

# Lower-case image extensions (NOTE(review): consumer is not visible here,
# presumably ProcessImage filters on it).
%allowedexts = map { $_ => 1 } qw(jpg jpeg png svg gif);

# Equation rendering is only done when requested on the command line.
$mathsupport = 0;

# Dump download; "LANG" is replaced by the root path before use.
$downloadsupport = 0;
$downloadurl     = 'http://download.wikimedia.org/LANGwiki/latest/LANGwiki-latest-pages-articles.xml.bz2';
$downloadname    = 'LANGwiki-latest-pages-articles.xml.bz2';

# Category index generation and its tuning knobs
# (NOTE(review): meaning of the two limits is defined by Dumpcat -- verify).
$catsupport = 0;
$cathashlen = 4;
$maxcatrefs = 16;

# Namespace keys found in the dump header. A namespace is indexed only when
# its value here is 1; every other namespace is ignored while scanning.
%namespaces = map { $_ => 0 } (-2, -1, 1 .. 15, 100 .. 105);

# NOTE(review): presumably throttling for downloads (pause $pauselen every
# $pausefreq requests) -- confirm where these are consumed.
$pausefreq = 200;
$pauselen  = 5;

# Names and chunk size used when packing directories
# (consumed by the Pack*Directory routines, not visible here).
$packfile  = 'pack.dat';
$idxfile   = 'idx.dat';
$chunksize = 10000000;
|---|
| 133 |
|
|---|
| 134 |
|
|---|
| 135 |
|
|---|
| 136 |
|
|---|
| 137 |
|
|---|
| 138 |
|
|---|
| 139 |
|
|---|
| 140 |
use Encode; |
|---|
| 141 |
use File::Copy; |
|---|
| 142 |
use utf8; |
|---|
| 143 |
|
|---|
| 144 |
# Help text; printed by --help or when no argument is given.
$usage = <<END;
$0: Offline wikipedia search index generator.
Usage:
$0 RootPath
If dump file is already present in RootPath/bz2

$0 DumpFile
Script will guess language and create appropriate directories

Options:
--image/img: download image thumbnails from wikipedia
--cat: generate category indexes
--math: generate images for <math></math> equations (requires latex)
--download: download dump from wikimedia first (requires curl)
N: skip first N archive files (the resulting index will be incomplete but it's useful when restarting downloads / math processings)
--help: this message
--version: revision information
END

# Honour --help / --version wherever they appear. Without this pre-scan a
# leading "--help" would be consumed as the root path and a directory of
# that name would be created.
foreach my $candidate (@ARGV) {
    die "$usage\n"     if ($candidate =~ /--help/i);
    die "$revisions\n" if ($candidate =~ /--version/i);
}

# First positional argument: either a root directory or a dump file name.
$rootpath = shift @ARGV or die "$usage\n";

# Remaining arguments are option switches, or a bare number N meaning
# "skip the first N archive chunks". defined() keeps a literal "0" from
# ending the loop early.
while (defined($arg = shift @ARGV)) {
    if ($arg =~ /--image|img/i)           { $imgsupport      = 1; }
    if ($arg =~ /--math/i)                { $mathsupport     = 1; }
    if ($arg =~ /^\d+$/)                  { $skipfirst       = $arg; }
    if ($arg =~ /--download/i)            { $downloadsupport = 1; }
    if ($arg =~ /--cat(egor(y|ies))?/i)   { $catsupport      = 1; }
}
|---|
| 174 |
|
|---|
| 175 |
# When the first argument looks like a dump file ("<lang>wiki-....xml.bz2"),
# remember it as the master archive and use the language prefix as root path.
if ($rootpath =~ m/^([\w\-]+)wiki\-.*\.xml\.bz2$/i) {
    $masterfile = $rootpath;
    $rootpath   = $1;
}

# Working directories, all below the root path.
$archivepath = "$rootpath/bz2";
$indexpath   = "$rootpath/db";
$imgpath     = "$rootpath/img";
$mathpath    = "$rootpath/math";

MakeDirs($rootpath);
MakeDirs($archivepath);
MakeDirs($indexpath);

# Move a dump given on the command line into the archive directory.
# $masterfile stays undefined when a root path was given instead.
if (defined $masterfile && -e $masterfile) {
    move($masterfile, $archivepath)
        or die "Could not move $masterfile to $archivepath: $!\n";
}

# md5_hex is needed by both the image and the math code paths. "use" runs at
# compile time wherever it is written, so it is stated once, unconditionally.
use Digest::MD5 qw(md5_hex);

if ($imgsupport > 0) {
    mkdir "$imgpath";
    # Loaded at run time so that LWP is only required when images are
    # actually requested.
    require HTTP::Request;
    require LWP::UserAgent;
    $ua = LWP::UserAgent->new;
    $ua->agent("iPhonePedia");
}

if ($mathsupport > 0) {
    mkdir "$mathpath";
}

# Run-wide state.
$starttime = time;
@Titles    = ();
$pattern   = "*.xml.bz2";

$indexfile       = "Index.dat";
$bzip2recovercmd = "bzip2recover";
$bzcatcmd        = "bzcat";

# Counters reported at the end of the scan.
$imagecount         = 0;
$failedimagecount   = 0;
$foundimagecount    = 0;
$equationcount      = 0;
$foundequationcount = 0;
|---|
| 220 |
|
|---|
| 221 |
|
|---|
| 222 |
|
|---|
| 223 |
|
|---|
| 224 |
########################### |
|---|
| 225 |
# |
|---|
| 226 |
# SANITY |
|---|
| 227 |
# |
|---|
| 228 |
########################### |
|---|
| 229 |
|
|---|
| 230 |
# Each external tool is probed by running it and looking for the word
# "usage" in whatever it prints.

# Without a working external sort, fall back on the internal algorithm.
unless (`$sortcmd --help 2>&1` =~ /usage/i) {
    print "Could not find correct external sort.\nFalling back on internal algorithm (slower).\n";
    $sortmethod = 3;
}

# Math rendering needs both dvipng and latex; only probed when requested.
if ($mathsupport) {
    my $tools_ok = `$dvipscmd 2>&1` =~ m/usage/i
                && `$latexcmd --help 2>&1` =~ m/usage/i;
    unless ($tools_ok) {
        die "Math support requires latex and dvipng.\nPlease install a LaTeX distribution to get those.\n";
    }
}

# bzcat is mandatory.
unless (`$bzcatcmd --help 2>&1` =~ /usage/i) {
    die "Cannot find bzcat (tried with $bzcatcmd).\n";
}

# curl is only needed for the optional dump download.
if ($downloadsupport) {
    unless (`$curlcmd --help 2>&1` =~ /usage/i) {
        die "Curl is required for download support.\nPlease install it from http://curl.haxx.se\n";
    }
}

# Summary of the directories in use.
print "\n";
print "== Root path: $rootpath\n";
print "== Archives path: $archivepath\n";
print "== Index path: $indexpath\n";
if ($imgsupport)  { print "== Image path: $imgpath\n"; }
if ($mathsupport) { print "== Math path: $mathpath\n"; }
print "\n";
|---|
| 242 |
|
|---|
| 243 |
|
|---|
| 244 |
########################### |
|---|
| 245 |
# |
|---|
| 246 |
# SPLIT ARCHIVE IF NEEDED |
|---|
| 247 |
# |
|---|
| 248 |
########################### |
|---|
| 249 |
|
|---|
| 250 |
# Look for archives (either the full dump or already-split chunks).
@list = glob("$archivepath/$pattern");

if (@list == 0) {
    # Fill the language into the download templates.
    $downloadurl  =~ s/LANG/$rootpath/g;
    $downloadname =~ s/LANG/$rootpath/g;

    if ($downloadsupport) {
        print "No archive found. Attempting to download it from wikimedia.\n";
        my $target = "$archivepath/$downloadname";
        # Use the configured (and sanity-checked) curl command. --fail makes
        # curl return an error on HTTP failures instead of saving the
        # server's error page as if it were the archive.
        system(qq{$curlcmd --fail "$downloadurl" -o "$target"});
        my $failed = ($? != 0);
        unlink $target if ($failed && -e $target);    # drop partial download
        if ($failed || ! -e $target) {
            die "Could not download archive.\n$downloadurl does not seem to exist. Please download manually from http://download.wikimedia.org\n";
        }
        @list = glob("$archivepath/$pattern");
    }
    else {
        die "Could not find any archive. Please make sure either the wikipedia dump or the splitted files are in $archivepath\nYou may want to download $downloadurl\n";
    }
}

$downloadtime = int((time - $starttime) / 60);

# A single archive that is not itself a bzip2recover chunk ("rec<N>...")
# is split into independently readable chunks.
if (@list == 1 && not $list[0] =~ /rec\d+/i) {
    $bigfile = $list[0];
    print "Only one archive found ($bigfile). Splitting.\n";

    print `$bzip2recovercmd "$bigfile"`;
    @list = glob("$archivepath/$pattern");
    if (@list > 1) {
        print "Success!\n";
        # Park the unsplit archive in the current directory so that it is
        # no longer picked up from the archive directory.
        move($bigfile, ".")
            or warn "Could not move $bigfile out of $archivepath: $!\n";
    }
}

$archivetime = int((time - $starttime) / 60);

# Number of archive files, used for progress output.
$allfiles = @list;
|---|
| 284 |
|
|---|
| 285 |
|
|---|
| 286 |
|
|---|
| 287 |
########################### |
|---|
| 288 |
# |
|---|
| 289 |
# SCAN ARCHIVES |
|---|
| 290 |
# |
|---|
| 291 |
########################### |
|---|
| 292 |
|
|---|
| 293 |
# Walk every archive chunk and
#   - append one "title#chunk-offset" line per article to the title index,
#   - optionally fetch images, render equations and collect categories.
open(OUTFILE, '>', $indexfile) or die "Cannot write $indexfile: $!\n";
foreach $bzfile (@list) {
    # The unsplit master archive is never scanned directly.
    next if (defined $bigfile && $bzfile eq $bigfile);

    # Only bzip2recover chunks ("rec<N><lang>wiki...") are processed.
    next unless ($bzfile =~ m/rec(\d+)(\D+)wiki/);
    $idx = int($1);

    # Chunk 1 is always read because the namespace table is taken from it.
    next if ($idx < ($skipfirst || 0) && $idx != 1);

    print "Scanning file number $idx/$allfiles\n";
    $content = `$bzcatcmd "$bzfile"`;
    #print "$content";

    if ($idx == 1) {
        #extract namespace information
        while ($content =~ m|<namespace key="([\d\-]*)">(.*?)</namespace>|ig) {
            my ($nskey, $nsname) = ($1, $2);
            # Every namespace not explicitly enabled in %namespaces is ignored.
            unless (($namespaces{$nskey} || 0) == 1) {
                $ignorenamespaces{lc $nsname} = 1;
                print "Ignoring namespace $nsname\n";
            }
            if ($nskey == 6) {
                #save localized image namespace name
                $imgnamespace = $nsname;
            }
            if ($nskey == 14) {
                #save category namespace name
                $catnamespace = $nsname;
            }
        }
        if ($imgnamespace) {
            print "Using '$imgnamespace:' namespace for images\n";
        }
        if ($catnamespace) {
            print "Using '$catnamespace:' namespace for categories\n";
        }
    }

    # One precompiled pattern for all ignored namespaces. Names are quoted so
    # that regex metacharacters in a namespace name are matched literally.
    my $ignore_re;
    if (%ignorenamespaces) {
        my $alternatives = join('|', map { quotemeta($_) } keys %ignorenamespaces);
        $ignore_re = qr/^(?:$alternatives)\:/i;
    }

    while ($content =~ m|<title>([^<]+)</title>|g) {
        $title = $1;
        # Offset of the opening <title> tag inside the uncompressed chunk.
        $offset = pos($content) - length("<title>$title</title>");
        next if (defined $ignore_re && $title =~ $ignore_re);

        #$title=decode("utf8", $title);
        #$title=PlainAscii($title);

        #print "$title#$idx\n";

        if ($sortmethod == 2) {
            # Kept in memory; written out after the in-memory sort.
            push(@Titles, "$title#$idx-$offset");
        }
        elsif ($sortmethod == 1 || $sortmethod == 3) {
            print OUTFILE "$title#$idx-$offset\n";
            #InsertSorted("$title#$idx-$offset");
        }
    }

    if ($imgsupport > 0) {
        # "image" plus the localized namespace name, when one was found.
        my $imgns_alt = 'image';
        if (defined $imgnamespace && length $imgnamespace) {
            $imgns_alt .= '|' . quotemeta($imgnamespace);
        }
        while ($content =~ m/(\[|^|\=)\s*($imgns_alt)\:(.*?)(\]|$)/mgi) {
            $foundimagecount++;
            $descr = $3;
            # Requested width, or the default thumbnail size.
            if ($descr =~ m/(\d+)px/) { $width = $1; }
            else                      { $width = $imgsize; }
            # Keep only the file name: drop options after "|" and trim.
            $descr =~ s/\|.*//;
            $descr =~ s/\s+$//;
            $descr =~ s/^\s+//;

            #skip past attempts for inexistent images
            next if ($failedimg{$descr});

            print " (" . GetProgress() . ")-Getting image #$imagecount: ";
            $returned = ProcessImage($descr, $width);

            if ($returned == 1) {
                print "ok\n";
                $imagecount++;
            }
            elsif ($returned == 2) {
                print "already downloaded\n";
            }
            else {
                print "failed\n";
                $failedimg{$descr} = 1;
                $failedimagecount++;
            }
        }
    }

    if ($mathsupport > 0) {
        while ($content =~ m|\<math\>\s*(.*?)\s*\</math\>|mgi) {
            print " (" . GetProgress() . ")-Rendering equation #$equationcount: ";
            ProcessEquation($1);
        }
    }

    # Without a known category namespace the pattern would degenerate to
    # "[[:...]]" and match ordinary links, so the pass is skipped.
    if ($catsupport > 0 && defined $catnamespace && length $catnamespace) {
        my $catns = quotemeta($catnamespace);
        while ($content =~ m|\[\[$catns\:\s*([^\]]*?)\s*\]\]|mgi) {
            $cat = $1;
            $cat =~ s/[\n\|].*//gi;
            # The article owning this category link is the closest <title>
            # before the match.
            $start = rindex($content, "<title>", pos($content));
            $end   = index($content, "</title>", $start);
            if ($start >= 0 && $end > 0) {
                $title = substr($content, $start + 7, $end - $start - 7);
                #print " Adding '$title' to category '$cat'\n";
                push @{ $catmembers{$cat} }, $title;
            }
        }
    }
}
|---|
| 408 |
|
|---|
| 409 |
|
|---|
| 410 |
# Minutes elapsed since start, measured at the end of the archive scan.
$exploretime = int((time - $starttime) / 60);

# Pack downloaded images and report the image counters.
if ($imgsupport) {
    print "== Packing image dir:\n";
    PackImgDirectory();

    print "== Image stats:\n";
    print " Detected : $foundimagecount\n";
    print " Downloaded: $imagecount\n";
    print " Failed : $failedimagecount\n";
}

# Pack rendered equations and report the equation counters.
if ($mathsupport) {
    print "== Packing math dir:\n";
    PackMathDirectory();

    print "== Math stats:\n";
    print " Found : $foundequationcount\n";
    print " Generated : $equationcount\n";
}

# Write the category index and report how many categories were collected.
if ($catsupport) {
    print "== Dumping categories:\n";
    Dumpcat();

    my $category_total = keys %catmembers;
    print "== Categories:\n";
    print " Found : $category_total\n";
}
|---|
| 438 |
|
|---|
| 439 |
|
|---|
| 440 |
|
|---|
| 441 |
########################### |
|---|
| 442 |
# |
|---|
| 443 |
# SORT Index.dat |
|---|
| 444 |
# |
|---|
| 445 |
########################### |
|---|
| 446 |
|
|---|
| 447 |
print "== Sorting\n";

# Kept for compatibility: the locale that was in effect before sorting.
$locale = $ENV{"LC_ALL"};

{
    # Sort in the C locale so the order does not depend on the user's
    # settings. local() restores the previous value on leaving this block
    # and removes the variable again if it was not set before.
    local $ENV{"LC_ALL"} = "C";

    if ($sortmethod == 1) {
        # External sort, in place, ignoring case.
        close OUTFILE;
        print `$sortcmd -f -o \"$indexfile\" \"$indexfile\"`;
        warn "External sort exited with status $?; $indexfile may be unsorted\n" if ($? != 0);
        #other options -t \'#\' -k 1,1
    }
    elsif ($sortmethod == 2) {
        # Titles were collected in memory during the scan.
        merge_sort(\@Titles, 0, scalar(@Titles) - 1);
        foreach $elt (@Titles) {
            print OUTFILE "$elt\n";
        }
        close OUTFILE;
    }
    elsif ($sortmethod == 3) {
        # Re-read the unsorted index, insert every line in order, then
        # rewrite the file from the sorted list.
        close OUTFILE;
        open(OUTFILE, '<', $indexfile) or die "Cannot read $indexfile: $!\n";
        while (<OUTFILE>) {
            chomp;
            InsertSorted("$_");
        }
        close OUTFILE;
        open(OUTFILE, '>', $indexfile) or die "Cannot write $indexfile: $!\n";
        foreach $elt (@Titles) {
            print OUTFILE "$elt\n";
        }
        close OUTFILE;
    }
}

$sorttime = int((time - $starttime) / 60);
|---|
| 483 |
|
|---|
| 484 |
|
|---|
| 485 |
|
|---|
| 486 |
########################### |
|---|
| 487 |
# |
|---|
| 488 |
# BUILD FULL-TEXT INDEX |
|---|
| 489 |
# |
|---|
| 490 |
########################### |
|---|
| 491 |
|
|---|
| 492 |
# Path to index database files |
|---|
| 493 |
$HASH      = "${indexpath}/0_hash";
$HASHWORDS = "${indexpath}/0_hashwords";
$FINFO     = "${indexpath}/0_finfo";
$SITEWORDS = "${indexpath}/0_sitewords";
$WORD_IND  = "${indexpath}/0_word_ind";

#minimum word length to index
$min_length = 3;

# Index or not numbers (set $numbers = "" if you don't want to index numbers)
$numbers = '0-9';

# Which substrings of a word get hashed (see build_hash):
#   1 = the whole word, 3 = every 4-character substring,
#   anything else = the first 4 characters only.
$INDEXING_SCHEME = 1;

# Words that are never indexed.
$stop_words = "and any are but can had has have her here him his how its not our out per she some than that the their them then there these they was were what you";

# Number of buckets in the on-disk hash table.
$HASHSIZE = 300001;

@stop_words = split(/\s+/, $stop_words);
foreach $stopword (@stop_words) { $stop_words_array{$stopword} = 1; }

print "== Start indexing\n";

$cfn = 0;    # number of indexed entries (incremented by index_title)
$cwn = 0;    # number of distinct words

# All index files live under $indexpath, so that is the directory that must
# exist (no unrelated "db" directory is created in the working directory).
unless (-d $indexpath) {
    mkdir($indexpath, 0755) or die("Can't create directory $indexpath: $!\n");
    print "== Directory '$indexpath' has been created\n";
}

open(fp_FINFO,     '>', $FINFO)     or die("Can't open index file!\n");
open(fp_SITEWORDS, '>', $SITEWORDS) or die("Can't open index file!\n");
open(fp_WORD_IND,  '>', $WORD_IND)  or die("Can't open index file!\n");

binmode fp_FINFO;
binmode fp_SITEWORDS;
binmode fp_WORD_IND;

# The entry file starts with a lone newline, so no entry sits at offset 0.
print fp_FINFO "\x0A";

# Fills %words (word => packed list of entry offsets) via index_title.
scan_list("$indexfile");

if ($cfn == 0) {
    die "No files are indexed\n";
}

print "== Computing word hash\n";
# Words and their posting lists are written through two string buffers that
# are flushed whenever the posting buffer exceeds 32000 bytes. The position
# of each record is (file position at last flush) + (bytes buffered so far).
$pos_sitewords      = tell(fp_SITEWORDS);
$pos_word_ind       = tell(fp_WORD_IND);
$to_print_sitewords = "";
$to_print_word_ind  = "";
foreach $word (keys %words) {
    $value = $words{$word};
    $cwn++;
    # Two 32-bit offsets: where the word text and where its posting list start.
    $words_word_dum = pack("NN",
        $pos_sitewords + length($to_print_sitewords),
        $pos_word_ind + length($to_print_word_ind));
    $to_print_sitewords .= "$word\x0A";
    # Posting list: entry count (4 bytes per entry) followed by the offsets.
    $to_print_word_ind .= pack("N", length($value) / 4) . $value;
    # From here on %words maps each word to its pair of file offsets.
    $words{$word} = $words_word_dum;

    if (length($to_print_word_ind) > 32000) {
        print fp_SITEWORDS $to_print_sitewords;
        print fp_WORD_IND $to_print_word_ind;
        $to_print_sitewords = "";
        $to_print_word_ind  = "";
        $pos_sitewords      = tell(fp_SITEWORDS);
        $pos_word_ind       = tell(fp_WORD_IND);
    }
}
print fp_SITEWORDS $to_print_sitewords;
print fp_WORD_IND $to_print_word_ind;

close(fp_SITEWORDS);
close(fp_WORD_IND);

$indextime = int((time - $starttime) / 60);

print "== Dumping hash\n";

build_hash();

print "== $cfn entries indexed\n";

$dumptime = int((time - $starttime) / 60);

# Remove temporary files and the intermediate title index.
unlink <temp.*>;
unlink $indexfile;

# Cumulative minutes since start at the end of each phase.
print "\n";
print "- Download time: ${downloadtime}m\n" if ($downloadsupport);
print "- Archive time: ${archivetime}m\n";
print "- Explore time: ${exploretime}m\n";
print "- Sort time: ${sorttime}m\n";
print "- Index time: ${indextime}m\n";
print "- Dump time: ${dumptime}m\n";
|---|
| 614 |
|
|---|
| 615 |
|
|---|
| 616 |
|
|---|
| 617 |
|
|---|
| 618 |
|
|---|
| 619 |
|
|---|
| 620 |
|
|---|
| 621 |
|
|---|
| 622 |
|
|---|
| 623 |
|
|---|
| 624 |
|
|---|
| 625 |
|
|---|
| 626 |
|
|---|
| 627 |
|
|---|
| 628 |
|
|---|
| 629 |
|
|---|
| 630 |
|
|---|
| 631 |
|
|---|
| 632 |
|
|---|
| 633 |
|
|---|
| 634 |
# Hash a string to a non-negative integer.
#
# For every character the running value is shifted left by four bits and the
# character code is added; whenever bits 24..27 become set they are folded
# back into the low bits and cleared.
#
#   $key    - string to hash
#   returns - the hash value (0 for the empty string)
#
# All working variables are lexical: the sub must not touch the caller's
# package variables (build_hash calls it from inside its own counting loop).
sub risearch_hash {
    my ($key) = @_;

    my $mask = hex("0F000000");
    my $h    = 0;

    foreach my $char (split(//, $key)) {
        $h = ($h << 4) + ord($char);
        my $g = $h & $mask;
        # Fold the overflow nibble into the low bits ...
        $h ^= $g >> 24 if ($g);
        # ... and clear it from the running value.
        $h &= ~$g;
    }

    return $h;
}
|---|
| 653 |
|
|---|
| 654 |
|
|---|
| 655 |
|
|---|
| 656 |
|
|---|
| 657 |
|
|---|
| 658 |
|
|---|
| 659 |
|
|---|
| 660 |
|
|---|
| 661 |
# Register one index entry and add its words to the global word table.
#
#   $textindex - text whose words are indexed (UTF-8 encoded bytes)
#   $url       - line stored in the entry file (fp_FINFO) for this entry
#
# Side effects: increments $cfn, appends "$url\n" to fp_FINFO and appends the
# packed 32-bit offset of that line to $words{$word} for every kept word.
sub index_title {
    my ($textindex, $url) = @_;

    $cfn++;

    # Work on characters, not bytes, while normalising.
    $textindex = decode_utf8($textindex);

    $textindex = RemoveHTMLentities($textindex);

    # Keep only word characters, spaces and hyphens; collapse whitespace.
    $textindex =~ s/[^\w\d -]/ /g;
    $textindex =~ s/\s+/ /g;

    $textindex = PlainAscii($textindex);

    # Back to bytes: %words is keyed by encoded strings.
    $textindex = encode_utf8($textindex);
    my @title_words = split(/\s+/, $textindex);

    # Offset of this entry inside the entry file, taken before writing it.
    my $entry_offset = pack("N", tell(fp_FINFO));
    print fp_FINFO "$url\x0A";

    foreach my $word (@title_words) {
        next if (length($word) < $min_length);
        next if ($stop_words_array{$word});
        $words{$word} .= $entry_offset;
    }

    return;
}
|---|
| 695 |
|
|---|
| 696 |
|
|---|
| 697 |
|
|---|
| 698 |
|
|---|
| 699 |
|
|---|
| 700 |
|
|---|
| 701 |
|
|---|
| 702 |
|
|---|
| 703 |
# Write the on-disk hash table that maps word hashes to word records.
#
# Reads the globals %words (word => 8-byte packed record), $HASHSIZE,
# $INDEXING_SCHEME, $HASH and $HASHWORDS. Produces two files:
#   $HASH      - $HASHSIZE 32-bit offsets into $HASHWORDS (0 = empty bucket)
#   $HASHWORDS - per bucket: record count followed by the 8-byte records
#
# Loop counters are lexical so that helper calls cannot disturb them.
sub build_hash {
    my @buckets = ("") x $HASHSIZE;

    foreach my $word (keys %words) {
        my $value = $words{$word};

        # Scheme 3 hashes every 4-character substring, scheme 1 the whole
        # word, any other scheme only the first 4 characters.
        my $subbound = ($INDEXING_SCHEME == 3) ? length($word) - 3 : 1;
        $subbound = 1 if (length($word) == 3);
        my $substring_length = ($INDEXING_SCHEME == 1) ? length($word) : 4;

        for my $start (0 .. $subbound - 1) {
            my $hash_value =
                abs(risearch_hash(substr($word, $start, $substring_length)) % $HASHSIZE);
            $buckets[$hash_value] .= $value;
        }
    }

    open(my $hash_fh,      '>', $HASH)      or die("Can't open index file!");
    open(my $hashwords_fh, '>', $HASHWORDS) or die("Can't open index file!");

    binmode $hash_fh;
    binmode $hashwords_fh;

    # Offset 0 is reserved to mean "empty bucket", so the words file starts
    # with four padding bytes.
    my $empty = pack("N", 0);
    print {$hashwords_fh} $empty;

    # Output is buffered in strings and flushed in chunks; a bucket's offset
    # is (file position at last flush) + (bytes buffered so far).
    my $pos_hashwords      = tell($hashwords_fh);
    my $to_print_hash      = "";
    my $to_print_hashwords = "";

    foreach my $bucket (@buckets) {
        if ($bucket eq "") {
            $to_print_hash .= $empty;
        }
        else {
            $to_print_hash .= pack("N", $pos_hashwords + length($to_print_hashwords));
            # Each record is 8 bytes (two packed 32-bit offsets).
            $to_print_hashwords .= pack("N", length($bucket) / 8) . $bucket;
        }
        if (length($to_print_hashwords) > 64000) {
            print {$hash_fh} $to_print_hash;
            print {$hashwords_fh} $to_print_hashwords;
            $to_print_hash      = "";
            $to_print_hashwords = "";
            $pos_hashwords      = tell($hashwords_fh);
        }
    }
    print {$hash_fh} $to_print_hash;
    print {$hashwords_fh} $to_print_hashwords;

    close($hash_fh);
    close($hashwords_fh);

    return;
}
|---|
| 759 |
|
|---|
| 760 |
|
|---|
| 761 |
|
|---|
| 762 |
|
|---|
| 763 |
|
|---|
| 764 |
|
|---|
| 765 |
|
|---|
| 766 |
|
|---|
| 767 |
|
|---|
| 768 |
|
|---|
| 769 |
|
|---|
| 770 |
|
|---|
| 771 |
# Feed every line of the title index to index_title.
#
#   $dbfile - path of the title index; each line has the form
#             "title#chunk-offset" as written by the archive scan
#
# For each well-formed line index_title is called with the title part and the
# complete line. If the file cannot be opened a message is printed and
# nothing is indexed.
#
# NOTE(review): the line pattern was reconstructed from the index line format
# (the original pattern was cut off at the "#") -- confirm against upstream.
sub scan_list {
    my ($dbfile) = @_;

    print "== Scanning $dbfile\n";

    my $list_fh;
    unless (open($list_fh, '<', $dbfile)) {
        print "Cannot open $dbfile\n";
        return;
    }

    while (my $line = <$list_fh>) {
        chomp $line;
        # Greedy (.*) keeps any "#" inside the title with the title.
        if ($line =~ /^(.*)#\d+-\d+$/) {
            index_title($1, $line);
        }
    }
    close($list_fh);

    return;
}
|---|
| 787 |
|
|---|
| 788 |
# Replace a small set of HTML entities by plain characters.
#
#   $text   - input string
#   returns - the string with the entities amp, ndash, lt, gt, quote and
#             quot (each written as "&name;") replaced
#
# All entities are replaced in one pass, so the result does not depend on
# hash ordering and already-decoded text is never decoded a second time.
#
# NOTE(review): the entity names were reconstructed from a mangled table; the
# replacement characters are the ones present there (including "quot"
# mapping to an apostrophe) -- confirm against upstream.
sub RemoveHTMLentities {
    my ($text) = @_;

    # Keyed by bare entity name (without "&" and ";").
    my %replacement_for = (
        amp   => "&",
        ndash => "-",
        lt    => "<",
        gt    => ">",
        quote => "\"",
        quot  => "\'",
    );

    $text =~ s/&(amp|ndash|lt|gt|quote|quot);/$replacement_for{$1}/g;

    return $text;
}
|---|
| 808 |
|
|---|
| 809 |
# Return the MD5 hex digest of a normalised media file name.
#
#   $medianame - file name as found in the wiki markup
#   returns    - 32-character lower-case hex digest
#
# Normalisation: first letter upper-cased, %XX escapes decoded, and
# whitespace, "<" and ">" replaced by underscores.
sub GetMediaHash {
    my ($medianame) = @_;

    $medianame = ucfirst $medianame;
    # Decode only genuine %XX escapes; a literal "%" followed by anything
    # else is left alone instead of being turned into a NUL byte.
    $medianame =~ s/%([0-9A-Fa-f]{2})/pack("C", hex($1))/ge;
    $medianame =~ s/\s/_/g;

    $medianame =~ s/\>/_/g;
    $medianame =~ s/\</_/g;

    return md5_hex($medianame);
}
|---|
| 832 |
|
|---|
| 833 |
sub ProcessImage { |
|---|
| 834 |
my ($title,$width)=@_; |
|---|
| 835 |
my ($path, $onlinepath, $outpath, $outfile, $langpath, $ext); |
|---|
| 836 |
|
|---|
| 837 |
|
|---|
| 838 |
if($title =~ m/\.([^\.]+)$/i) {$ext=lc $1;} |
|---|
| 839 |
|
|---|