Assembla home | Assembla project page
 

InstallationInstructions: makeindex.pl

File makeindex.pl, 30.6 kB (added by kinglothar, 9 months ago)
Line 
# Revision history, printed verbatim by the --version command line option.
$revisions = <<"END";
Revisions

v0.17 - 08 Mar 07
 - Support of categories

v0.16 - 29 Feb 07
 - Using localized image namespace (e.g. Bild: for de)

v0.15 - 25 Feb 07
 - Changed deaccenting method to make it locale-independant
 - FIX locale-independant sort command

v0.10 - 20 Feb 07
 - Packing of image and math directories in at most 256 files for more convenient copy to device

v0.9 - 17 Feb 07
 - Added support for math notations (requires LaTeX)
 - Added some sanity checks on external dependencies
 - Added ability to take the archive as argument instead of the language. directories are created and archive is moved over as necessary
 - Better error management
 - Experimental support for automatic dump download (requires curl)
 - Better command line syntax to activate dump download, image download and math support

END


###########################
#
#  CONFIGURATION
#
###########################


# External commands
#
# IMPORTANT: do NOT use the default Windows sort command, it is far too slow.
# Use the Cygwin (GNU) version instead.
$sortcmd  = 'sort';
$curlcmd  = 'curl';      # needed only for direct dump download
$latexcmd = 'latex';     # needed only for math support
$dvipscmd = 'dvipng';    # needed only for math support

# Sort method:
#   1 = external sort command (preferred when GNU sort is available)
#   2 = in-place merge sort (very slow, do not use)
#   3 = insertion sort
$sortmethod = 1;


# Images
#
# Image support:
#   0 = no images
#   1 = download and resize images
#   2 = image links (not supported yet)
# Overridden by the "image" / "img" command line option.
$imgsupport = 0;

# URL templates and default thumbnail width used by the image handling.
$thumbmediaprefix = 'http://upload.wikimedia.org/wikipedia/%s/thumb';
$fullmediaprefix  = 'http://upload.wikimedia.org/wikipedia/%s';
$imgsize          = 120;

# File extensions that are downloaded when $imgsupport is enabled
# (svg is converted to png at download time).
%allowedexts = map { ($_ => 1) } qw(jpg jpeg png svg gif);


# Math
#
# Requires a LaTeX distribution (such as http://www.tug.org/protext/ for Windows).
# Overridden by the "math" command line option.
$mathsupport = 0;


# Download
#
# "LANG" in the two strings below is a placeholder for the language code.
$downloadsupport = 0;
$downloadurl     = 'http://download.wikimedia.org/LANGwiki/latest/LANGwiki-latest-pages-articles.xml.bz2';
$downloadname    = 'LANGwiki-latest-pages-articles.xml.bz2';


# Categories
#
$catsupport = 0;
$cathashlen = 4;
$maxcatrefs = 16;

# Namespaces
#
# Namespaces are identified by their language-independent index.
# A value of 0 means "ignore", 1 means "include". Every namespace listed
# here is ignored by default:
#    -2 Media             -1 Special
#     1 Talk               2 User              3 User talk
#     4 Wikipedia          5 Wikipedia talk
#     6 Image              7 Image talk
#     8 MediaWiki          9 MediaWiki talk
#    10 Template          11 Template talk
#    12 Help              13 Help talk
#    14 Category          15 Category talk
#   100 Portail          101 Discussion Portail
#   102 Projet           103 Discussion Projet
#   104 Référence        105 Discussion Référence
%namespaces = map { ($_ => 0) } (-2, -1, 1 .. 15, 100 .. 105);


# Pauses when launching external commands
# (explorer.exe apparently leaks memory when external commands keep being launched).
$pausefreq = 200;    # pause on every nth external command; 0 disables pausing
$pauselen  = 5;      # pause duration in seconds


# Packing
$packfile  = 'pack.dat';
$idxfile   = 'idx.dat';
$chunksize = 10000000;
133
134 ###########################
135 #
136 #  INIT
137 #
138 ###########################
139
140 use Encode;
141 use File::Copy;
142 use utf8;
143
# "use" runs at compile time, so these modules are loaded whether or not the
# image/math options are given; they are listed here (rather than inside the
# conditionals below) so that this is obvious to the reader.
use Digest::MD5 qw(md5_hex);
use HTTP::Request;
use LWP::UserAgent;

# Help text shown by --help and when no argument is given.
$usage = <<END;
$0: Offline wikipedia search index generator.
Usage:
$0 RootPath
 If dump file is already present in RootPath/bz2
 
$0 DumpFile
 Script will guess language and create appropriate directories

Options:
 --image/img: download image thumbnails from wikipedia
 --cat:       generate category indexes
 --math:      generate images for <math></math> equations (requires latex)
 --download:  download dump from wikimedia first (requires curl)
 N:           skip first N archive files (the resulting index will be incomplete but it's useful when restarting downloads / math processings)
 --help:      this message
 --version:   revision information
END

# First argument: either the root path or the dump file itself.
$rootpath = shift @ARGV or die "$usage\n";

# Number of leading archive chunks to skip; 0 unless a bare number is given.
$skipfirst = 0;

# Remaining arguments are options; each one may toggle several settings.
while ($arg = shift @ARGV) {
    if ($arg =~ /--image|img/i)           { $imgsupport = 1; }
    if ($arg =~ /--math/i)                { $mathsupport = 1; }
    if ($arg =~ /^\d+$/)                  { $skipfirst = $arg; }
    if ($arg =~ /--download/i)            { $downloadsupport = 1; }
    if ($arg =~ /--help/i)                { die "$usage\n"; }
    if ($arg =~ /--version/i)             { die "$revisions\n"; }
    if ($arg =~ /--cat(egor(y|ies))?/i)   { $catsupport = 1; }
}

# When a dump file name was given, the part before "wiki-" becomes the root path.
if ($rootpath =~ m/^([\w\-]+)wiki\-.*\.xml\.bz2$/i) {
    $masterfile = $rootpath;
    $rootpath   = $1;
}

$archivepath = "$rootpath/bz2";
$indexpath   = "$rootpath/db";
$imgpath     = "$rootpath/img";
$mathpath    = "$rootpath/math";

MakeDirs($rootpath);
MakeDirs($archivepath);
MakeDirs($indexpath);

# Move the dump into the archive directory; $masterfile is only set when the
# first argument looked like a dump file name.
if (defined $masterfile && -e $masterfile) {
    move($masterfile, $archivepath)
        or warn "Could not move $masterfile to $archivepath: $!\n";
}

if ($imgsupport > 0) {
    mkdir $imgpath;
    $ua = LWP::UserAgent->new;
    $ua->agent("iPhonePedia");
}

if ($mathsupport > 0) {
    mkdir $mathpath;
}

$starttime = time;
@Titles    = ();
$pattern   = "*.xml.bz2";

$indexfile       = "Index.dat";
$bzip2recovercmd = "bzip2recover";
$bzcatcmd        = "bzcat";

# Counters reported in the final statistics.
$imagecount         = 0;
$failedimagecount   = 0;
$foundimagecount    = 0;
$equationcount      = 0;
$foundequationcount = 0;
220
221
222
223
224 ###########################
225 #
226 #  SANITY
227 #
228 ###########################
229
# Each external tool is probed by running it and looking for the word
# "usage" in its combined stdout/stderr output.

# A missing external sort is not fatal: fall back on the internal sort.
unless (`$sortcmd --help 2>&1` =~ /usage/i) {
    print "Could not find correct external sort.\nFalling back on internal algorithm (slower).\n";
    $sortmethod = 3;
}

# Math support needs both dvipng and latex (latex is only probed when dvipng answered).
if ($mathsupport) {
    unless (`$dvipscmd 2>&1` =~ m/usage/i && `$latexcmd --help 2>&1` =~ m/usage/i) {
        die "Math support requires latex and dvipng.\nPlease install a LaTeX distribution to get those.\n";
    }
}

# bzcat is mandatory.
if (`$bzcatcmd --help 2>&1` !~ /usage/i) {
    die "Cannot find bzcat (tried with $bzcatcmd).\n";
}

# curl is only needed when a download was requested.
if ($downloadsupport && `$curlcmd --help 2>&1` !~ /usage/i) {
    die "Curl is required for download support.\nPlease install it from http://curl.haxx.se\n";
}

# Summary of the directories that will be used.
print "\n";
print "== Root path:     $rootpath\n";
print "== Archives path: $archivepath\n";
print "== Index path:    $indexpath\n";
if ($imgsupport) {
    print "== Image path:    $imgpath\n";
}
if ($mathsupport) {
    print "== Math path:     $mathpath\n";
}
print "\n";
242
243
244 ###########################
245 #
246 #  SPLIT ARCHIVE IF NEEDED
247 #
248 ###########################
249
# Collect the archives already present in the archive directory.
@list = glob("$archivepath/$pattern");

if (@list == 0) {
    # Substitute the language code into the download URL / file name.
    # This is done even without download support because the error message
    # below mentions the URL.
    $downloadurl  =~ s/LANG/$rootpath/g;
    $downloadname =~ s/LANG/$rootpath/g;

    unless ($downloadsupport) {
        die "Could not find any archive. Please make sure either the wikipedia dump or the splitted files are in $archivepath\nYou may want to download $downloadurl\n";
    }

    print "No archive found. Attempting to download it from wikimedia.\n";
    # Use the configured curl command; the list form of system() bypasses the
    # shell, so neither the URL nor the target path needs quoting.
    system($curlcmd, $downloadurl, "-o", "$archivepath/$downloadname");
    if (! -e "$archivepath/$downloadname") {
        die "Could not download archive.\n$downloadurl does not seem to exist. Please download manually from http://download.wikimedia.org\n";
    }
    @list = glob("$archivepath/$pattern");
}

$downloadtime = int((time - $starttime)/60);

# A single archive whose name does not contain "recNNN" is taken to be the
# full dump: split it into chunks with bzip2recover.
if (@list == 1 && $list[0] !~ /rec\d+/i) {
    $bigfile = $list[0];
    print "Only one archive found ($bigfile). Splitting.\n";

    # Quote the path so that directories containing spaces survive the shell.
    print `$bzip2recovercmd \"$bigfile\"`;
    @list = glob("$archivepath/$pattern");
    if (@list > 1) {
        print "Success!\n";
        # Move the original dump out of the archive directory.
        move($bigfile, ".") or warn "Could not move $bigfile out of $archivepath: $!\n";
        # The dump itself is not a chunk: drop it so that $allfiles counts
        # only the files that will actually be scanned.
        @list = grep { $_ ne $bigfile } @list;
    }
}

$archivetime = int((time - $starttime)/60);

# Number of archive chunks, used in progress messages.
$allfiles = @list;
284
285
286
287 ###########################
288 #
289 #  SCAN ARCHIVES
290 #
291 ###########################
292
# Scan every archive chunk: collect article titles (with their chunk number
# and byte offset) into the index file, and optionally process images,
# equations and category links found in the text.
# OUTFILE stays open after this loop; it is closed by the sorting step.
open(OUTFILE, ">", $indexfile) or die "Cannot write $indexfile: $!\n";
foreach $bzfile (@list) {
    # The unsplit dump (if any) is never scanned, only its chunks.
    next if (defined $bigfile && $bzfile eq $bigfile);
    next unless ($bzfile =~ m/rec(\d+)(\D+)wiki/);

    $idx = int($1);
    # Chunk 1 is always read because it carries the namespace definitions.
    next if ($idx < ($skipfirst || 0) && $idx != 1);

    print "Scanning file number $idx/$allfiles\n";
    # Decompress the whole chunk into memory (path quoted for the shell).
    $content = `$bzcatcmd \"$bzfile\"`;

    if ($idx == 1) {
        # Extract namespace information.
        while ($content =~ m|<namespace key="([\d\-]*)">(.*?)</namespace>|ig) {
            my ($nskey, $nsname) = ($1, $2);
            unless ($namespaces{$nskey} == 1) {
                $ignorenamespaces{lc $nsname} = 1;
                print "Ignoring namespace $nsname\n";
            }
            if ($nskey == 6) {
                # Save localized image namespace name.
                $imgnamespace = $nsname;
            }
            if ($nskey == 14) {
                # Save localized category namespace name.
                $catnamespace = $nsname;
            }
        }
        if ($imgnamespace) {
            print "Using '$imgnamespace:' namespace for images\n";
        }
        if ($catnamespace) {
            print "Using '$catnamespace:' namespace for categories\n";
        }
    }

    # One case-insensitive pattern matching any ignored "Namespace:" prefix.
    # Names are quoted so that regex metacharacters in them are taken literally.
    my $ignore_re;
    if (%ignorenamespaces) {
        my $alternatives = join "|", map { quotemeta } keys %ignorenamespaces;
        $ignore_re = qr/^(?:$alternatives):/i;
    }

    while ($content =~ m|<title>([^<]+)</title>|g) {
        $title  = $1;
        # Offset of the opening <title> tag inside the decompressed chunk.
        $offset = pos($content) - length("<title>$title</title>");
        next if ($ignore_re && $title =~ $ignore_re);

        my $entry = "$title#$idx-$offset";
        if ($sortmethod == 2) {
            # The merge sort works on the in-memory list.
            push(@Titles, $entry);
        }
        elsif ($sortmethod == 1 || $sortmethod == 3) {
            # Both other methods read the entries back from the index file.
            print OUTFILE "$entry\n";
        }
    }

    if ($imgsupport > 0) {
        # Accept the generic "image:" prefix and, when known, the localized one.
        my $imgns_re = (defined $imgnamespace && length $imgnamespace)
            ? "image|\Q$imgnamespace\E"
            : "image";
        while ($content =~ m/(\[|^|\=)\s*($imgns_re)\:(.*?)(\]|$)/mgi) {
            $foundimagecount++;
            $descr = $3;
            # An explicit "NNNpx" in the link overrides the default width.
            if ($descr =~ m/(\d+)px/) { $width = $1; }
            else                      { $width = $imgsize; }
            # Keep only the file name: drop link options and surrounding blanks.
            $descr =~ s/\|.*//;
            $descr =~ s/\s+$//;
            $descr =~ s/^\s+//;

            # Skip images that already failed once.
            next if ($failedimg{$descr} == 1);

            print " (".GetProgress().")-Getting image #$imagecount: ";
            $returned = ProcessImage($descr, $width);

            if ($returned == 1) {
                print "ok\n";
                $imagecount++;
            }
            elsif ($returned == 2) {
                print "already downloaded\n";
            }
            else {
                print "failed\n";
                $failedimg{$descr} = 1;
                $failedimagecount++;
            }
        }
    }

    if ($mathsupport > 0) {
        # <math> tags appear XML-escaped in the dump.
        while ($content =~ m|\&lt;math\&gt;\s*(.*?)\s*\&lt;/math\&gt;|mgi) {
            my $equation = $1;
            print " (".GetProgress().")-Rendering equation #$equationcount: ";
            ProcessEquation($equation);
        }
    }

    # Without a known category namespace the pattern would degenerate to
    # "[[:...]]" and collect ordinary links, so categories are skipped then.
    if ($catsupport > 0 && defined $catnamespace && length $catnamespace) {
        while ($content =~ m|\[\[\Q$catnamespace\E\:\s*([^\]]*?)\s*\]\]|mgi) {
            $cat = $1;
            # Keep only the category name (drop sort key / anything after a newline).
            $cat =~ s/[\n\|].*//gi;
            # The article owning this link is the last <title> before it.
            $start = rindex($content, "<title>", pos($content));
            $end   = index($content, "</title>", $start);
            # rindex/index return -1 when nothing is found; offset 0 is valid.
            if ($start >= 0 && $end > $start) {
                $title = substr($content, $start + 7, $end - $start - 7);
                push @{$catmembers{$cat}}, $title;
            }
        }
    }
}
408
409
# Minutes elapsed since start, once all archives have been explored.
$exploretime = int((time - $starttime)/60);

# Pack the image directory and report image counters.
if ($imgsupport) {
    print "== Packing image dir:\n";
    PackImgDirectory();

    print "== Image stats:\n",
          "  Detected  : $foundimagecount\n",
          "  Downloaded: $imagecount\n",
          "  Failed    : $failedimagecount\n";
}

# Pack the math directory and report equation counters.
if ($mathsupport) {
    print "== Packing math dir:\n";
    PackMathDirectory();

    print "== Math stats:\n",
          "  Found     : $foundequationcount\n",
          "  Generated : $equationcount\n";
}

# Dump the categories and report how many distinct ones were collected.
if ($catsupport) {
    print "== Dumping categories:\n";
    Dumpcat();

    my $category_count = keys %catmembers;
    print "== Categories:\n",
          "  Found     : $category_count\n";
}
438
439
440
441 ###########################
442 #
443 #  SORT Index.dat
444 #
445 ###########################
446
# Sort the title index according to $sortmethod.
print "== Sorting\n";

# Remember the caller's locale setting (kept for backward compatibility).
$locale = $ENV{"LC_ALL"};

{
    # Force the C locale for the duration of the sort. "local" restores the
    # previous state on scope exit, including the case where LC_ALL was not
    # set at all (it is then removed again instead of being left empty).
    local $ENV{"LC_ALL"} = "C";

    if ($sortmethod == 1) {
        # External sort, in place on the index file.
        close OUTFILE;
        print `$sortcmd -f -o \"$indexfile\" \"$indexfile\"`;
        #other options -t \'#\' -k 1,1
    }
    elsif ($sortmethod == 2) {
        # Titles were collected in memory: sort them, then write them out.
        merge_sort(\@Titles, 0, scalar(@Titles)-1);
        foreach my $elt (@Titles) {
            print OUTFILE "$elt\n";
        }
        close OUTFILE;
    }
    elsif ($sortmethod == 3) {
        # Read the unsorted index back, insert every entry into @Titles,
        # then rewrite the index file from @Titles.
        close OUTFILE;
        open(OUTFILE, "<", $indexfile) or die "Cannot read $indexfile: $!\n";
        while (my $entry = <OUTFILE>) {
            chomp $entry;
            InsertSorted("$entry");
        }
        close OUTFILE;
        open(OUTFILE, ">", $indexfile) or die "Cannot write $indexfile: $!\n";
        foreach my $elt (@Titles) {
            print OUTFILE "$elt\n";
        }
        close OUTFILE;
    }
}


$sorttime = int((time - $starttime)/60);
483
484
485
486 ###########################
487 #
488 #  BUILD FULL-TEXT INDEX
489 #
490 ###########################
491
492 # Path to index database files
# Paths of the index database files.
$HASH      = "${indexpath}/0_hash";
$HASHWORDS = "${indexpath}/0_hashwords";
$FINFO     = "${indexpath}/0_finfo";
$SITEWORDS = "${indexpath}/0_sitewords";
$WORD_IND  = "${indexpath}/0_word_ind";

# Minimum word length to index.
$min_length = 3;

# Index or not numbers (set $numbers = "" if you don't want to index numbers).
# You may add here other non-letter characters which you want to index.
$numbers = '0-9';

# Indexing scheme:
#   1 = whole word
#   2 = beginning of the word
#   3 = every substring
$INDEXING_SCHEME = 1;

# List of stopwords (never indexed).
$stop_words = "and any are but can had has have her here him his how its not our out per she some than that the their them then there these they was were what you";

# Number of slots in the on-disk hash table.
$HASHSIZE = 300001;

@stop_words = split(/\s+/, $stop_words);
foreach $stopword (@stop_words) { $stop_words_array{$stopword} = 1; }

print "== Start indexing\n";

# Counters: indexed entries and distinct words.
$cfn = 0;
$cwn = 0;

# The index files live in $indexpath, so that is the directory that must exist
# (the previous code created a stray "db" directory in the current directory).
if (! -d $indexpath) {
    mkdir($indexpath, 0755) or die("Can't create directory $indexpath: $!\n");
    print "== Directory '$indexpath' has been created\n";
}

# These handles are global because index_title() writes to fp_FINFO.
open(fp_FINFO,     ">", $FINFO)     or die("Can't open index file!\n");
open(fp_SITEWORDS, ">", $SITEWORDS) or die("Can't open index file!\n");
open(fp_WORD_IND,  ">", $WORD_IND)  or die("Can't open index file!\n");

binmode fp_FINFO;
binmode fp_SITEWORDS;
binmode fp_WORD_IND;

# The file-info table starts with a single newline, so no entry is at offset 0.
print fp_FINFO "\x0A";

scan_list("$indexfile");

if ($cfn == 0) {
    die "No files are indexed\n";
}

# All entries have been written; flush and release the file-info table.
close(fp_FINFO) or die("Can't write index file $FINFO: $!\n");

print "== Computing word hash\n";
{
    # Output is buffered in memory and flushed in chunks. The recorded offsets
    # are "file position at last flush + length of the pending buffer".
    my $pos_sitewords      = tell(fp_SITEWORDS);
    my $pos_word_ind       = tell(fp_WORD_IND);
    my $to_print_sitewords = "";
    my $to_print_word_ind  = "";

    foreach my $word (keys %words) {
        my $value = $words{$word};
        $cwn++;

        # Offsets of this word in the sitewords and word_ind files.
        my $offsets = pack("NN",
                           $pos_sitewords + length($to_print_sitewords),
                           $pos_word_ind  + length($to_print_word_ind));

        $to_print_sitewords .= "$word\x0A";
        # $value is a string of 4-byte entries: store their count, then the entries.
        $to_print_word_ind  .= pack("N", length($value)/4).$value;

        # From here on %words maps each word to its pair of file offsets.
        $words{$word} = $offsets;

        if (length($to_print_word_ind) > 32000) {
            print fp_SITEWORDS $to_print_sitewords;
            print fp_WORD_IND  $to_print_word_ind;
            $to_print_sitewords = "";
            $to_print_word_ind  = "";
            $pos_sitewords = tell(fp_SITEWORDS);
            $pos_word_ind  = tell(fp_WORD_IND);
        }
    }
    print fp_SITEWORDS $to_print_sitewords;
    print fp_WORD_IND  $to_print_word_ind;
}

close(fp_SITEWORDS);
close(fp_WORD_IND);

$indextime = int((time - $starttime)/60);

print "== Dumping hash\n";

build_hash();

print "== $cfn entries indexed\n";


$dumptime = int((time - $starttime)/60);
597
598
599 ###########################
600 #
601 #  CLEANUP & FINAL STATS
602 #
603 ###########################
# Remove temporary files and the intermediate title index.
unlink glob("temp.*");
unlink $indexfile;

# Report the elapsed minutes recorded at each stage.
print "\n";
if ($downloadsupport) {
    print "- Download time: ${downloadtime}m\n";
}
print "- Archive time:  ${archivetime}m\n",
      "- Explore time:  ${exploretime}m\n",
      "- Sort time:     ${sorttime}m\n",
      "- Index time:    ${indextime}m\n",
      "- Dump time:     ${dumptime}m\n";
614
615
616
617
618
619
620
621
622
623
624
625
626
627 #=====================================================================
628 #
629 #    Function risearch_hash($key)
630 #    Last modified: 16.04.2004 17:54
631 #
632 #=====================================================================
633
# Compute the integer hash of a string.
#
# Parameters:
#   $key - the string to hash (an empty or undefined key hashes to 0)
# Returns:
#   a non-negative integer
#
# For every character the running value is shifted left by 4 bits and the
# character code is added; bits 24-27 (mask 0x0F000000) are then folded back
# into the low bits and cleared. The arithmetic is intentionally left as is:
# any change would alter the hash values written to the index.
#
# All working variables are lexical. The previous version used the package
# globals $i, $h, $g, @chars and @chars2, so calling it from inside a loop
# driven by the global $i silently changed that loop's counter.
sub risearch_hash {
    my ($key) = @_;

    my $mask = hex("0F000000");
    my $hash = hex("00000000");

    foreach my $code (map { ord($_) } split(//, $key)) {
        $hash = ($hash << 4) + $code;
        my $high = $hash & $mask;
        if ($high) { $hash ^= $high >> 24; }
        $hash &= ~$high;
    }

    return $hash;
}
653
#=====================================================================
#
#    Function index_title($textindex,$url)
#    Last modified: 15.07.2004 11:35
#
#=====================================================================

# Index one title.
#
# Parameters:
#   $textindex - the title text, as UTF-8 encoded bytes
#   $url       - the full index line, written verbatim to the fp_FINFO file
#
# Side effects:
#   - increments the global entry counter $cfn
#   - appends "$url\n" to the fp_FINFO handle (must already be open)
#   - appends the packed 32-bit offset of that line to $words{$word} for every
#     word of the title that is at least $min_length long and not a stopword
sub index_title {
    my ($textindex, $url) = @_;

    $cfn++;

    # Decode UTF-8 so that \w below works on characters rather than bytes.
    $textindex = decode_utf8($textindex);

    $textindex = RemoveHTMLentities($textindex);
    # Everything except word characters, spaces and dashes becomes a space.
    $textindex =~ s/[^\w\d -]/ /g;
    $textindex =~ s/\s+/ /g;
    $textindex = PlainAscii($textindex);

    # Back to binary.
    $textindex = encode_utf8($textindex);
    my @title_words = split(/\s+/, $textindex);

    # Offset of this entry in the file-info table, taken before writing it.
    my $packed_pos = pack("N", tell(fp_FINFO));
    print fp_FINFO "$url\x0A";

    foreach my $word (@title_words) {
        next if (length($word) < $min_length);
        next if ($stop_words_array{$word});
        $words{$word} .= $packed_pos;
    }

    return;
}
695
696 #=====================================================================
697 #
698 #    Function build_hash()
699 #    Last modified: 16.04.2004 17:54
700 #
701 #=====================================================================
702
# Build the on-disk hash table from the global %words and write it to the
# $HASH and $HASHWORDS files.
#
# Reads the globals %words (word => packed value), $HASHSIZE, $INDEXING_SCHEME,
# $HASH and $HASHWORDS; fills the global @hash_array.
#
# $HASH receives one packed 32-bit offset per slot (0 for an empty slot);
# $HASHWORDS receives, for each non-empty slot, a packed count followed by the
# concatenated values of the words hashing to that slot. The count is
# length/8, i.e. each value is treated as 8 bytes long.
#
# Loop counters are lexical so that helpers using a global $i cannot disturb
# the loops here.
sub build_hash {

    @hash_array = ("") x $HASHSIZE;

    foreach my $word (keys %words) {
        my $value    = $words{$word};
        my $word_len = length($word);

        # Number of substrings of the word to hash, and their length.
        my $subbound = ($INDEXING_SCHEME == 3) ? $word_len - 3 : 1;
        if ($word_len == 3) { $subbound = 1; }
        my $substring_length = ($INDEXING_SCHEME == 1) ? $word_len : 4;

        for (my $start = 0; $start < $subbound; $start++) {
            my $slot = abs(risearch_hash(substr($word, $start, $substring_length)) % $HASHSIZE);
            $hash_array[$slot] .= $value;
        }
    }

    open(my $hash_fh,      ">", $HASH)      or die("Can't open index file!");
    open(my $hashwords_fh, ">", $HASHWORDS) or die("Can't open index file!");

    binmode $hash_fh;
    binmode $hashwords_fh;

    # The hashwords file starts with four zero bytes, so that offset 0 can be
    # used in the hash file to mark an empty slot.
    my $zero = pack("N", 0);
    print {$hashwords_fh} $zero;
    my $pos_hashwords      = tell($hashwords_fh);
    my $to_print_hash      = "";
    my $to_print_hashwords = "";

    foreach my $elt (@hash_array) {
        if ($elt eq "") {
            $to_print_hash .= $zero;
        }
        else {
            # Offset = position at last flush + length of the pending buffer.
            $to_print_hash      .= pack("N", $pos_hashwords + length($to_print_hashwords));
            $to_print_hashwords .= pack("N", length($elt)/8).$elt;
        }
        # Flush both buffers in chunks to bound memory use.
        if (length($to_print_hashwords) > 64000) {
            print {$hash_fh}      $to_print_hash;
            print {$hashwords_fh} $to_print_hashwords;
            $to_print_hash      = "";
            $to_print_hashwords = "";
            $pos_hashwords      = tell($hashwords_fh);
        }
    }
    print {$hash_fh}      $to_print_hash;
    print {$hashwords_fh} $to_print_hashwords;

    # Buffered write errors only surface at close time.
    close($hash_fh)      or die("Can't write index file $HASH: $!\n");
    close($hashwords_fh) or die("Can't write index file $HASHWORDS: $!\n");

    return;
}
759 #=====================================================================
760
761
762
763
#=====================================================================
#
#    Function scan_list($dbfile)
#    Last modified: 05.04.2005 16:41
#
#=====================================================================

# Read the title index file and index every entry.
#
# Parameters:
#   $dbfile - path of the index file; each useful line has the form
#             "<title>#<rest>"
#
# For every line containing a '#', index_title() is called with the text
# before the last '#' and with the complete line.
# If the file cannot be opened a message is printed and nothing is indexed;
# this is not fatal here.
sub scan_list {
    my ($dbfile) = @_;

    print "== Scanning $dbfile\n";

    my $list_fh;
    unless (open($list_fh, "<", $dbfile)) {
        print "Cannot open $dbfile\n";
        return;
    }

    while (my $line = <$list_fh>) {
        chomp $line;
        if ($line =~ /^(.*)#.*/) {
            index_title($1, $line);
        }
    }
    close($list_fh);

    return;
}
787
# Replace a small set of HTML/XML entities by the characters they stand for.
#
# Parameters:
#   $text - the text to decode
# Returns:
#   the decoded copy of $text
#
# All entities are replaced in a single pass, so every entity in the input is
# decoded exactly once and the result does not depend on hash ordering: for
# example "&amp;lt;" becomes "&lt;". (The previous version ran one substitution
# per entity in hash order, so "&amp;lt;" came out as "&lt;" or "<" depending
# on the run.)
#
# NOTE(review): "&quote;" is not a standard entity and "&quot;" is mapped to a
# single quote rather than a double quote. Both mappings are kept exactly as
# they were; confirm whether this is intentional.
sub RemoveHTMLentities {
    my ($text) = @_;

    my %entities = (
        "&amp;"   => "&",
        "&ndash;" => "-",
        "&lt;"    => "<",
        "&gt;"    => ">",
        "&quote;" => "\"",
        "&quot;"  => "\'",
    );

    # Alternation of all known entities, each taken literally.
    my $entity_re = join "|", map { quotemeta } sort keys %entities;
    $text =~ s/($entity_re)/$entities{$1}/g;

    return $text;
}
808
# Compute the MD5 hex digest of a normalized media name.
#
# Parameters:
#   $medianame - the media (file) name as found in the article text
# Returns:
#   the 32-character lowercase hex MD5 digest of the normalized name
#
# Normalization steps, in order:
#   1. upper-case the first character
#   2. decode %xx escapes, where xx are two hexadecimal digits
#   3. turn every whitespace character into "_"
#   4. turn "<" and ">" into "_" (some systems cannot write files with them)
#
# Only genuine hexadecimal escapes are decoded. The previous pattern "%(..)"
# also consumed a literal percent sign followed by any two characters (as in
# "100% pure.jpg") and replaced them with a NUL byte, producing a wrong digest.
sub GetMediaHash {
    my ($medianame) = @_;

    $medianame = ucfirst $medianame;
    $medianame =~ s/%([0-9A-Fa-f]{2})/pack("C", hex($1))/ge;
    $medianame =~ s/\s/_/g;
    $medianame =~ s/[<>]/_/g;

    return md5_hex($medianame);
}
832
833 sub ProcessImage {
834         my ($title,$width)=@_;
835         my ($path, $onlinepath, $outpath, $outfile, $langpath, $ext);
836        
837         #reject unsupported extensions
838         if($title =~ m/\.([^\.]+)$/i) {$ext=lc $1;}
839