#!/usr/bin/perl -w use strict; use LWP::UserAgent; use HTML::Entities; use HTML::Parser (); use Text::Iconv; use Encode qw(encode decode); use Data::Dumper; use locale; our ( @P, @Div, $This_Uri, $This_Image_Retrieved ); sub urlize($) { my ( $uri, $last_uri ) = shift; if($uri !~ "^http://") { if($uri =~ "^/") { $uri = "http://www.flickr.com$uri"; } else { $uri = $This_Uri/$uri; } } return $uri; } sub start($%) { my ( $tagname, $attr ) = @_; if( $tagname eq 'p' ) { if( defined $attr->{class} ) { push @P, $attr->{class}; #print "+ " . $attr->{class} . "\n"; } } elsif( $tagname eq 'div' ) { if( defined $attr->{id} ) { #print $attr->{id} . "\n"; push @Div, $attr->{id}; } elsif( defined $attr->{class} ) { #print $attr->{class} . "\n"; push @Div, $attr->{class}; } } elsif( $tagname eq 'a' ) { #@P and print $P[-1] . "\n" . Dumper($attr); if( @P and ( $P[-1] eq 'Photo' or $P[-1] eq 'PoolList' ) ) { if( defined $attr->{href} and not $This_Image_Retrieved) { #print 'page: ' . $attr->{href} . "\n"; # FIX: find a better solution $This_Image_Retrieved = zoom_image($attr->{href}); } } elsif( @Div and $Div[-1] eq 'setThumbs' ) { if( defined $attr->{href} ) { #print 'page: ' . $attr->{href} . "\n"; zoom_image($attr->{href}); } } elsif( @Div and $Div[-1] eq 'Paginator' ) { if( defined $attr->{class} and $attr->{class} eq 'Next' and defined $attr->{href} ) { #print 'next: ' . $attr->{href} . "\n"; get_page( urlize( $attr->{href} ) ); } } } } sub end($%) { my ( $tagname, $attr ) = @_; if( $tagname eq 'p' ) { if( my $class = pop @P ) { #print "- " . $class . "\n"; $This_Image_Retrieved = 0; } } if( $tagname eq 'div' ) { if( my $class = pop @Div ) { #print "- " . $class . 
"\n"; } } } sub get_page($) { my $uri = shift; #print "fetching index $uri\n"; $This_Uri = $uri; my $ua = new LWP::UserAgent; $ua->timeout(10); $ua->env_proxy; #print "$uri\n"; my $res = $ua->get($uri); unless($res->is_success) { return 1; } my $page = $res->content; #my $page = `links -source $uri`; # Create parser object my $p = HTML::Parser->new( api_version => 3, start_h => [\&start, "tagname, attr"], end_h => [\&end, "tagname"], marked_sections => 1, ); $p->utf8_mode(1); $p->parse($page); $p->eof; } sub download_image($$) { my ( $img, $title ) = @_; my $filename = ""; if( $title ) { #print "title: $title\n"; $filename = $title; $filename =~ s/\xA0/ /g; $filename =~ y/ÀÁÂÃÄÅÆÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿ/AAAAAAACDEEEEIIIINOOOOOOUUUUYaaaaaaaceeeeiiiinoooooouuuuyy/; $filename =~ s/ß/ss/g; $filename =~ s/ð/th/g; #print "decoded: $filename\n"; if( $filename =~ /\S/ ) { $filename =~ s/\s/_/g; $filename =~ s/[^a-zA-Z0-9_\-.]//g; #$filename =~ s/[\'\"\&\|\#\$\%\(\)\!]//g; $filename =~ s/^\s+//; $filename =~ s/\s+$//; #print "|$filename|\n"; if( $filename =~ /\S/ ) { if ( -f "$filename.jpg" ) { my $findex = 1; while( -f ( sprintf "%s.%03d.jpg", $filename, $findex ) ) { $findex++; } $filename = sprintf "%s.%03d", $filename, $findex; } $filename .= ".jpg"; } #print "final: $filename\n"; } else { $filename = ""; } } if( $filename =~ /\S/ ) { print `wget --quiet "$img" -O $filename`; } else { # FIX remove ?v=0 print `wget --quiet -c "$img"`; } } sub get_image($$) { my ( $uri, $title ) = @_; my $ua = new LWP::UserAgent; $ua->timeout(10); $ua->env_proxy; my $res = $ua->get($uri); my $retries = 2; while( $retries-- and not $res->is_success) { $res = $ua->get($uri); } unless( $res->is_success) { warn "Unable to fetch $uri"; return 0; } my $image_found; foreach my $row ( split /\n/, decode( "utf8", $res->content ) ) { #if( $row =~ /]*>([^<]+)/ ) { # $title = $1; # print "$title\n"; #} if( $row =~ /img src="([^"]+)"/ ) { my $img = $1; # FIX: find a better 
method, use image id or something if( $img =~ /_[obm][.]jpg$/ ) { $image_found++; download_image($img, $title) } } } unless ( $image_found ) { warn "No image found in $uri"; return 0; } return 1; } sub zoom_image($) { my $uri = shift; my $title; my $ua = new LWP::UserAgent; $ua->timeout(10); $ua->env_proxy; my $res = $ua->get(urlize($uri)); unless($res->is_success) { warn "unable to fetch $uri"; return 1; } my ( $zoomable, $main_uri ); foreach my $row ( split /\n/, $res->content ) { if( $row =~ /]*>([^<]+)/ ) { my $string = $1; $title = decode_entities( decode( "utf8", $string, Encode::FB_QUIET ) ); #print length($title). " $title :$string:\n"; } if( $row =~ /\s*new( "utf8", "iso88591" ); foreach my $req_uri (@ARGV) { get_page( $req_uri ); }
# ---------------------------------------------------------------------------
# Documentation and review notes
#
# NOTE(review): this file has been collapsed onto three physical lines, so
# the inline "#" comments above swallow the code that follows them and the
# script cannot compile as stored. The notes below describe the statements as
# they read once the original line breaks are restored.
#
# NOTE(review): two spans are damaged and cannot be recovered from this text:
#   * the pattern /]*>([^<]+)/ (in zoom_image and in a commented-out line of
#     get_image) begins with a stray "]" and has presumably lost its leading
#     part;
#   * the second pattern in zoom_image is cut off right after its opening
#     "/", and everything up to `new( "utf8", "iso88591" )` is missing. That
#     covers the rest of zoom_image (any use of $zoomable / $main_uri, any
#     call to get_image) and the start of the top-level code. The surviving
#     call is presumably Text::Iconv->new, given `use Text::Iconv` -- confirm
#     against the original file.
#
# Purpose: for every URI given on the command line, get_page() downloads the
# page and feeds it to HTML::Parser; the start/end handlers follow selected
# links to photo pages (zoom_image) and to the next index page (get_page).
# urlize() hard-codes http://www.flickr.com as the host for root-relative
# links.
#
# Globals
#   @P, @Div               Stacks holding the class (for <p>) or the id/class
#                          (for <div>) of start tags seen so far and not yet
#                          popped by end().
#   $This_Uri              URI passed to the most recent get_page() call.
#   $This_Image_Retrieved  Set from zoom_image()'s return value for a link
#                          inside <p class="Photo"> / <p class="PoolList">;
#                          reset to 0 by end() when a true value is popped
#                          from @P.
#
# urlize($uri)
#   Returns $uri unchanged when it matches ^http://, prefixes
#   "http://www.flickr.com" when it starts with "/", and otherwise evaluates
#   `$This_Uri/$uri`.
#   NOTE(review): that last expression is a numeric division, not string
#   interpolation; two non-numeric strings both numify to 0, so it dies with
#   "Illegal division by zero". Presumably "$This_Uri/$uri" was intended.
#   NOTE(review): $last_uri is declared but never used. https:// URIs do not
#   match ^http:// and are therefore handled as relative links.
#
# start($tagname, $attr)   -- HTML::Parser start_h handler
#   <p>   : pushes the class attribute on @P, only if one is present.
#   <div> : pushes the id, or failing that the class, on @Div, only if one of
#           them is present.
#   <a>   : - top of @P is "Photo" or "PoolList": calls zoom_image(href)
#             unless $This_Image_Retrieved is already true, and stores the
#             result in $This_Image_Retrieved;
#           - else top of @Div is "setThumbs": calls zoom_image(href) for
#             every link with an href;
#           - else top of @Div is "Paginator" and the link has class "Next":
#             calls get_page(urlize(href)).
#
# end($tagname)            -- HTML::Parser end_h handler
#   Pops @P on </p> and @Div on </div>. The end_h argspec requests only
#   "tagname", so $attr is always undef here.
#   NOTE(review): start() pushes only for tags that carry a class/id while
#   end() pops for every closing tag, so the stacks drift out of step on
#   pages containing <p> or <div> elements without those attributes.
#
# get_page($uri)
#   Records $uri in $This_Uri, fetches it with LWP::UserAgent (10 s timeout,
#   proxy settings from the environment) and parses the body with
#   HTML::Parser (api_version 3, marked_sections on, utf8_mode on).
#   Returns 1 when the request fails; otherwise the value of $p->eof.
#   NOTE(review): start() calls get_page() again from inside a running
#   parse. The nested call overwrites $This_Uri and shares @P / @Div with
#   the outer parse; none of them is restored afterwards.
#
# download_image($img, $title)
#   Derives a file name from $title: \xA0 becomes a space, accented Latin
#   letters are transliterated with y///, sharp s becomes "ss" and eth
#   becomes "th", whitespace becomes "_", and every character outside
#   [a-zA-Z0-9_\-.] is dropped. If "<name>.jpg" already exists, the first
#   free "<name>.NNN.jpg" (NNN counting from 001) is used instead.
#   With a non-empty name the image is fetched through the shell command
#   `wget --quiet "$img" -O $filename`; otherwise through
#   `wget --quiet -c "$img"`. wget's output is printed; its exit status is
#   not examined.
#   NOTE(review): $img is taken from downloaded HTML and interpolated into a
#   shell command line; list-form system() would avoid the shell.
#   NOTE(review): s/^\s+// and s/\s+$// run after all whitespace has already
#   been replaced or removed, so they can never match.
#   NOTE(review): the y/// table contains non-ASCII literals and no
#   `use utf8` is visible; whether it matches the title bytes depends on the
#   file's encoding -- confirm.
#
# get_image($uri, $title)
#   Fetches $uri, repeating the request up to two more times on failure.
#   Scans the utf8-decoded body line by line; for the first img src="..." on
#   each line whose URL ends in _o.jpg, _b.jpg or _m.jpg it calls
#   download_image($img, $title).
#   Returns 1 if at least one such image was seen; otherwise warns and
#   returns 0 (also when the fetch fails).
#   NOTE(review): no call to get_image survives in this text.
#
# zoom_image($uri)
#   Fetches urlize($uri); on failure warns and returns 1.
#   NOTE(review): start() stores this return value in $This_Image_Retrieved,
#   so a failed fetch is recorded as "retrieved" -- confirm that is intended
#   (get_image() returns 0 on failure).
#   For each line matching the damaged title pattern, $title is set to the
#   captured text, utf8-decoded with Encode::FB_QUIET and then
#   entity-decoded. The remainder of the sub is missing (see above).
#
# Top level
#   Calls get_page() once for each element of @ARGV.
#
# Other observations
#   * zoom_image() and get_page() are called in start() before they are
#     defined, so their ($) prototypes are not applied at those call sites;
#     start() and end() are invoked through code references, where
#     prototypes are ignored.
#   * `new LWP::UserAgent` uses indirect-object syntax;
#     LWP::UserAgent->new is the unambiguous spelling.
#   * Data::Dumper is referenced only in a commented-out debug line.
# ---------------------------------------------------------------------------