#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTML::Entities;
use HTML::Parser ();
use Text::Iconv;
use Encode qw(encode decode);
use Data::Dumper;
use locale;

# State shared between the HTML::Parser callbacks below:
#   @P                    - class names pushed by start() for <p> tags
#   @Div                  - id (or, failing that, class) names pushed by
#                           start() for <div> tags
#   $This_Uri             - URI most recently passed to get_page()
#   $This_Image_Retrieved - value returned by the last zoom_image() call made
#                           for a 'Photo'/'PoolList' paragraph; reset by end()
our ( @P, @Div, $This_Uri, $This_Image_Retrieved );

# Turn a link found on a page into an absolute URI.
#   - absolute http:// and https:// URIs are returned unchanged
#   - site-relative links ("/...") are prefixed with http://www.flickr.com
#   - anything else is appended to $This_Uri with a "/" separator
# Returns the resulting URI string.
sub urlize($) {
    my ($uri) = @_;

    return $uri if $uri =~ m{^https?://};
    return "http://www.flickr.com$uri" if $uri =~ m{^/};

    # The "/" has to live inside the string: written bare between the two
    # variables Perl parses it as numeric division, not as a path join.
    return "$This_Uri/$uri";
}

# HTML::Parser start-tag handler (registered in get_page with the argspec
# "tagname, attr").
#   <p class=...>   : remembers the class on @P
#   <div id=...> or <div class=...> : remembers the id (preferred) or class
#                     on @Div
#   <a>             : depending on the innermost remembered <p>/<div> name,
#                     either hands the link to zoom_image() or follows the
#                     paginator's "Next" link with get_page()
# zoom_image() is defined elsewhere in this file.
sub start($%) {
    my ( $tagname, $attr ) = @_;

    if ( $tagname eq 'p' ) {
        push @P, $attr->{class} if defined $attr->{class};
    }
    elsif ( $tagname eq 'div' ) {
        if ( defined $attr->{id} ) {
            push @Div, $attr->{id};
        }
        elsif ( defined $attr->{class} ) {
            push @Div, $attr->{class};
        }
    }
    elsif ( $tagname eq 'a' ) {
        # The three contexts are mutually exclusive and checked in this
        # order; once a context matches, the later ones are not consulted
        # even if the link itself turns out to be unusable.
        if ( @P and ( $P[-1] eq 'Photo' or $P[-1] eq 'PoolList' ) ) {
            # Only the first link of such a paragraph is fetched.
            # FIX: find a better solution
            if ( defined $attr->{href} and not $This_Image_Retrieved ) {
                $This_Image_Retrieved = zoom_image( $attr->{href} );
            }
        }
        elsif ( @Div and $Div[-1] eq 'setThumbs' ) {
            zoom_image( $attr->{href} ) if defined $attr->{href};
        }
        elsif ( @Div and $Div[-1] eq 'Paginator' ) {
            if (    defined $attr->{class}
                and $attr->{class} eq 'Next'
                and defined $attr->{href} )
            {
                get_page( urlize( $attr->{href} ) );
            }
        }
    }
}

# HTML::Parser end-tag handler (registered in get_page with the argspec
# "tagname").
#   </p>   : pops @P and, if the popped class is a true value, clears
#            $This_Image_Retrieved
#   </div> : pops @Div
# NOTE(review): start() pushes only for tags that carry a class/id, while
# this handler pops on every closing tag, so the closing tag of an
# unclassified <p>/<div> removes an entry pushed by an earlier tag.
# Confirm whether that is intended before relying on the stack depth.
sub end($%) {
    my ($tagname) = @_;

    if ( $tagname eq 'p' ) {
        $This_Image_Retrieved = 0 if pop @P;
    }
    if ( $tagname eq 'div' ) {
        pop @Div;
    }
}

# Fetch an index page and run it through HTML::Parser with the start()/end()
# handlers above. Records $uri in $This_Uri so urlize() can resolve relative
# links found on the page.
# Returns 1 if the page could not be fetched, otherwise whatever
# $parser->eof returns.
sub get_page($) {
    my $uri = shift;
    $This_Uri = $uri;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;

    my $res = $ua->get($uri);
    unless ( $res->is_success ) {
        warn "Unable to fetch $uri";
        return 1;
    }

    my $parser = HTML::Parser->new(
        api_version     => 3,
        start_h         => [ \&start, "tagname, attr" ],
        end_h           => [ \&end, "tagname" ],
        marked_sections => 1,
    );
    $parser->utf8_mode(1);
    $parser->parse( $res->content );
    return $parser->eof;
}

# Download $img with wget.
# When $title yields a usable name, the image is saved as "<name>.jpg" in the
# current directory ("<name>.NNN.jpg" if "<name>.jpg" already exists);
# otherwise wget picks the name itself and is run with -c.
# The name is derived from $title by folding accented letters to ASCII,
# turning whitespace into "_" and dropping everything outside
# [a-zA-Z0-9_\-.].
# Always returns 1; a failing wget is reported with warn.
sub download_image($$) {
    my ( $img, $title ) = @_;
    my $filename = "";

    if ($title) {
        my $name = $title;
        $name =~ s/\xA0/ /g;
        $name =~ y/ÀÁÂÃÄÅÆÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿ/AAAAAAACDEEEEIIIINOOOOOOUUUUYaaaaaaaceeeeiiiinoooooouuuuyy/;
        $name =~ s/ß/ss/g;
        $name =~ s/ð/th/g;

        if ( $name =~ /\S/ ) {
            # After these two substitutions no whitespace is left, so no
            # separate trimming step is needed.
            $name =~ s/\s/_/g;
            $name =~ s/[^a-zA-Z0-9_\-.]//g;

            # The title may have consisted solely of characters that were
            # stripped; in that case fall back to wget's own naming.
            if ( $name =~ /\S/ ) {
                if ( -f "$name.jpg" ) {
                    my $findex = 1;
                    $findex++
                        while -f ( sprintf "%s.%03d.jpg", $name, $findex );
                    $name = sprintf "%s.%03d", $name, $findex;
                }
                $filename = "$name.jpg";
            }
        }
    }

    # List-form system() runs wget directly, so the scraped URL never passes
    # through a shell.
    my @cmd =
        $filename =~ /\S/
        ? ( 'wget', '--quiet', $img, '-O', $filename )
        : ( 'wget', '--quiet', '-c', $img );    # FIX remove ?v=0

    system(@cmd) == 0
        or warn "wget failed for $img (status $?)";

    return 1;
}

sub get_image($$) { my ( $uri, $title ) = @_; my $ua = new LWP::UserAgent; $ua->timeout(10); $ua->env_proxy; my $res = $ua->get($uri); my $retries = 2; while( $retries-- and not $res->is_success) { $res = $ua->get($uri); } unless( $res->is_success) { warn "Unable to fetch $uri"; return 0; } my $image_found; foreach my $row ( split /\n/, decode( "utf8", $res->content ) ) { #if( $row =~ /