# ODP::Category - An ODP public page parsing class (Version 0.01)
# Copyright (C) 2002 - 2004 Richard P. Fuller
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package ODP::Category;

use strict;
use LWP::Simple;
use URI::Escape;
use ODP::Site;
use ODP::Search;

# new - Initialises a new ODP::Category object
# Parameters: category
# Returns: ODP::Category object
# Dies with 'No category supplied.' when the category is undefined.
# The category is stored after being passed through normalise().
sub new {
    my ($class, $category) = @_;

    my $object = {};
    if (!defined $category) {
        die 'No category supplied.';
    }
    $object->{'category'} = normalise($category);

    # Two-argument bless: a subclass (or a call made on an existing
    # instance) gets an object of its own class rather than always
    # ODP::Category.
    return bless $object, ref($class) || $class;
}

# fetch - Fetches the public page and stores the source in the object
# Parameters:
# Returns:
# The page source is stored in $self->{'content'}. An empty category is
# treated as Top/ and gets a locally generated page instead of a download.
sub fetch {
    my $self = shift;

    if (!$self->{'category'}) {
        # Make a fake page for Top/
        my @top_level = qw(
            Arts Business Computers Games Health Home Kids_and_Teens News
            Recreation Reference Regional Science Shopping Society Sports World
        );

        # Assign rather than append, so calling fetch again replaces the
        # stored page instead of duplicating it.
        # NOTE(review): subcats()/subcats_plain() look for "</b></a>" markup
        # that this literal does not contain, so they find nothing for Top/
        # as written. The HTML in this string looks to have been stripped
        # from this copy of the file - confirm against the upstream source.
        $self->{'content'} = join '', map { "\n  • $_\n  (0)" } @top_level;
        return;
    }

    $self->{'content'} = LWP::Simple::get("http://dmoz.org/$self->{'category'}");
}

# convert - Converts the character set of the entire category
# Parameters: from charset, to charset
# Returns:
# Fetches the page first if it has not been fetched yet. Text::Iconv is
# loaded on demand, so it is only required when convert is actually used.
sub convert {
    my ($self, $from, $to) = @_;

    $self->fetch if !$self->{'content'};

    require Text::Iconv;
    Text::Iconv->import();

    my $converter = Text::Iconv->new($from, $to);
    $self->{'content'} = $converter->convert($self->{'content'});
}

# exists - Returns whether or not a category exists
# Parameters:
# Returns: Whether or not the category exists (boolean)
# A category counts as existing when fetching it leaves a true value in
# $self->{'content'}.
sub exists {
    my $self = shift;

    $self->fetch if !$self->{'content'};

    return $self->{'content'} ? 1 : 0;
}

# _subcat_names - Scrapes the raw subcategory names out of the page
# Parameters:
# Returns: array of subcategory names as captured from the page (strings)
# Private helper shared by subcats and subcats_plain. Works on copies of
# the page, so $self->{'content'} is left untouched.
# NOTE(review): each pattern below has a single capture group although the
# code reads $2 (and $1 as a path); the leading HTML of every pattern looks
# to have been stripped from this copy - restore from the upstream source.
sub _subcat_names {
    my $self = shift;

    $self->fetch if !$self->{'content'};

    my $page  = $self->{'content'};
    my $page2 = $page;
    my @names = ();

    # Entries whose count sits on the following line, closed by </i>.
    while ($page =~ s/\n  • ([^\<]+)<\/b><\/a>\n  \([^\)]+\)<\/i>//) {
        my $cat = $2;
        push @names, $cat;
    }

    # Entries whose count follows on the same line.
    while ($page2 =~ s/\n  • ([^\<]+)<\/b><\/a> \([^\)]+\)//) {
        my $cat = $2;
        push @names, $cat;
    }

    # Single-character entries.
    while ($page2 =~ s/(.)<\/b><\/a>//) {
        my ($path, $cat) = ($1, $2);

        # If this is a real subcategory, not an alphabar sideways link
        if ($path eq "$self->{'category'}/$cat/") {
            push @names, $cat;
        }
    }

    return @names;
}

# subcats - Returns an array of subcategory objects
# Parameters:
# Returns: array of subcategories (array of ODP::Category objects)
sub subcats {
    my $self = shift;

    my @subcats = ();
    for my $name ($self->_subcat_names) {
        push @subcats, ODP::Category->new($self->{'category'} . "/" . $name);
    }

    return @subcats;
}

# subcats_plain - Returns an array of subcategories
# Parameters:
# Returns: array of subcategories (array of strings)
# Each name is passed through normalise() before being returned.
sub subcats_plain {
    my $self = shift;

    my @subcats = ();
    for my $name ($self->_subcat_names) {
        push @subcats, normalise($name);
    }

    return @subcats;
}

# newsgroups - Returns an array containing a list of newsgroups
# Parameters:
# Returns: array of newsgroups (array of strings)
# Works on a copy of the page; each match is removed from the copy so the
# loop advances to the next entry.
sub newsgroups {
    my $self = shift;

    $self->fetch if !$self->{'content'};

    my $page       = $self->{'content'};
    my @newsgroups = ();

    while ($page =~ s/\n  • Usenet ([^ ]+) - news:<\/a> - Google Groups<\/a><\/small>//) {
        push @newsgroups, $1;
    }

    return @newsgroups;
}

# encoding - Returns the encoding used for the category (text)
# Parameters:
# Returns: encoding (string), or '' when none is found
sub encoding {
    my $self = shift;

    $self->fetch if !$self->{'content'};

    # Nothing could be fetched, so there is nothing to inspect.
    return '' if !defined $self->{'content'};

    # The match runs whether or not the page had to be fetched just now;
    # previously it sat inside the fetch guard, so an object with a cached
    # page never reached it.
    # NOTE(review): the pattern is empty in this copy of the file. Perl
    # treats an empty pattern as "reuse the last successful pattern", so
    # the charset-capturing expression must be restored from upstream.
    if ($self->{'content'} =~ m||i) {
        return $1;
    }

    return '';
}

# altlangs - Returns an array of altlangs (category objects)
# Parameters:
# Returns: array of altlangs (array of ODP::Category objects)
sub altlangs { my $self = shift; if (!$self->{'content'}) { $self->fetch; } my $page = $self->{'content'}; my @altlangs = (); if ($page =~ /<\/td><\/tr>