#!/usr/bin/perl # RDFsearch.pl - ODP (http://dmoz.org) content.rdf searcher (codename craziness) - Version 1.1.2 # Copyright (C) 2001-2003 Richard P. Fuller # Supply an uncompressed RDF on stdin (e.g. cat content.rdf.u8.gz | gzip -d | RDFsearch.pl) # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # or visit http://research.dmoz.org/~rpfuller/src/GPL2.txt if (!-e 'results'){mkdir 'results', 0755} if (!-e 'results/all-bytree'){mkdir 'results/all-bytree', 0755} my $restrict = ''; my %searches = ( 11 => '!!!!.html', 12 => '*somethings*.html', 13 => '6dots.html', 14 => 'backslashes-in-urls.html', 15 => 'e-mail.html', 16 => 'over1000.html', 17 => 'over512.html', 18 => 'phone1.html', 19 => 'smileys.html', 20 => 'spaces-in-urls.html', 21 => 'commas-in-domainnames.html'); my @trees = ('Adult','Arts','Business','Computers','Games','Health','Home','Kids_and_Teens','News','Recreation','Reference','Regional','Science','Shopping','Society','Sports','World'); foreach my $search (keys %searches) { open ($search,">results/$searches{$search}"); print $search 'Content RDF Search'; } foreach my $cat (@trees) { open ($cat,">results/all-bytree/$cat.html"); print $cat 'Content RDF Search ('.$cat.')'; } my $cat; while (<>) { my $title; my $desc; my $ages; my $url; if ($_ =~ //) { $cat=$1; } if ($_ =~ //) { $title=''; $desc=''; $ages=''; $url=$1; my $t=''; while ($t ne ''."\n") { $t=<>; if ($t =~ /([^<]+)<\/d:Title>/) { $title=$1; } if ($t =~ /([^<]+)<\/d:Description>/) { $desc=$1; } if ($t =~ /([^<]+)<\/ages>/) { $ages=$1; } } # !!!! if ($desc =~ /\!\!\!\!/) { &write(11,$cat,$url,$title,$desc); } # *something* if ($desc =~ /\*[^\*]+\*/) { &write(12,$cat,$url,$title,$desc); } # ...... if ($desc =~ /\.\.\.\.\.\./) { &write(13,$cat,$url,$title,$desc); } # \ in URL if ($url =~ /^[^\?]+\\/) { &write(14,$cat,$url,$title,$desc); } # x@y.z if ($desc =~ /[^\s]+\@[^\.]+\./) { &write(15,$cat,$url,$title,$desc); } # Long (>512 and >1000) character descriptions if (length($desc) > 512) { &write(17,$cat,$url,$title,$desc); if (length($desc) > 1000) { &write(16,$cat,$url,$title,$desc); } } # Phone numbers if ($desc =~ /[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]/ || $desc =~ /\([0-9][0-9][0-9]\)\s*[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]/) { &write(18,$cat,$url,$title,$desc); } # Smileys if ($desc =~ /:-*[\(\)]/) { &write(19,$cat,$url,$title,$desc); } # Spaces in URLs (excluding after #) my $des_url = $url; while ($des_url =~ s/(.*)#(.*)/$1/){} if ($des_url =~ /\s/) { &write(20,$cat,$url,$title,$desc); } # Commas in domains if ($url =~ /http:\/\/[^,^\/]+,[^\/]+\//) { &write(21,$cat,$url,$title,$desc); } # Unused regexps you can add if you want # (from various requests in the past) #if ($url =~ /http:\/\/(www\.)*[^\.]+\.(com|net|org)\/Home\.html/) #if ($desc =~ /&/) #if ($url =~ /(\/|\.)web.com(\/|$)/) #if ($url =~ /\@/) #"search=" "search?" "query=" "query?" "qt=" and "qt?" #if ($url =~ /search=/ || $url =~ /search\?/ || $url =~ /query=/ || $url =~ /query\?/ || $url =~ /qt=/ || $url =~ /qt\?/) #if ($url =~ /\.jpe?g$/) #if ($url =~ /\.html?\/$/) #if ($cat=~/^World\/Chinese/ && $desc =~ / /) #(sorting bug) #if ($title =~ /^ /) #if ($cat !~ /^World/ && $cat !~ /Adult\/World/ && $cat !~ /News\/Online_Archives/ && $cat !~ /Business\/Opportunities\/Networking\-MLM/ && $cat !~ /Reference\/Encyclopedias/ && $desc && $desc !~ / /) #if ($desc =~ /\?/) #if ($url =~ /\@/) #if ($url !~ /^http:\/\//) #if ($url =~ /\.swf$/i) #if ($url =~ /\/\.\.?\//) #,com|net|org #if ($url =~ /\,(com|net|org)/ || $title =~ /\,(com|net|org)/) #$url =~ /http:\/\/.*\.([^\.]+)(\/$|$)/; #my $x=$1; #$x=~s/\/$//g; #if (length($x) > 3) #if ($desc =~ /\&/) #if ($desc =~ /\.[\.]+/) #if ($desc =~ /\~\*/ || $desc =~ /\*\~/ || $title =~ /\~\*/ || $title =~ /\*\~/) #if ($desc && $desc !~ /[a-z]+/ && $cat !~ /^World/ && $cat !~ /^Adult\/World/) #Description contains category name #my $lastbit = $cat; #$lastbit =~ s/.*\/([^\/]+)/$1/g; #if ($desc =~ /$lastbit/) #Description contains title #if ($desc =~ /$title/) } } foreach my $search (keys %searches) { print $search ''; } foreach my $cat (@trees) { print $cat ''; } exit; { my $lastcategory; sub write($$$$$) { my ($handle,$cat,$url,$title,$desc) = @_; if ($cat =~ m|^News/Online_Archives|){return;} # Ignore this tree my ($tree)=split(/\//,$cat); if ($lastcategory ne $cat) { print $handle "

$cat

"; print $tree "

$cat

"; $lastcategory=$cat; } print $handle "
  • [EDIT] $title - \n$desc\n\n"; print $tree "
  • [EDIT] $title - \n$desc [$searches{$handle}]\n\n"; } }