wget.pl

829 views

Published on

Published in: Technology
0 Comments
2 Likes
Statistics
Notes
  • Be the first to comment

No Downloads
Views
Total views
829
On SlideShare
0
From Embeds
0
Number of Embeds
2
Actions
Shares
0
Downloads
4
Comments
0
Likes
2
Embeds 0
No embeds

No notes for slide

wget.pl

  1. 1. All YOUR PAGE ARE BELONG TO US すべてのウェブページをこの手に 2012/11/16 株式会社はてな 大西康裕 id:onishi
  2. 2.   id:onishi 大西康裕  ONISHI  @yasuhiro_onishi  株式会社はてな  はてなブログ
  3. 3. ウェブページを保存したい
  4. 4. ウェブページを保存したい •ウェブページは日々変化する •手元に置いておきたい •競合調査 • 魚拓 •画像などまとめて保存したい
  5. 5. GoogleChrome
  6. 6. HTML::Parser my $result; my $parser = HTML::Parser->new( start_h => [ sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; • text • start • end • process • declaration • comment • default
  7. 7. HTML::Parser start_h => [ sub { my($self, $tagname, $attr, $text) = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET して保存してローカルパスにする $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],
  8. 8. HTML::Parser default_h => [ sub { my($self, $text) = @_; $result .= $text; }, 'self,text', ],
  9. 9.
  10. 10. CSSから参照 $content =~ s{url\(([^)]+)\)}{ my $link = $1; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET して保存してローカルパスに "url($link)";}eg;
  11. 11. script 殺す my $context = { disallow => 0 }; my $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; }}], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; }}], default_h => [sub { if ($context->{disallow} > 0) { return; }}],
  12. 12. noscript 内を生かす my $nodisplay_tag = qr{noscript}; start_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; }}], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; }}],
  13. 13. base start_h => [sub { if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = "./"; }}],
  14. 14. できました!gist.github.com/ 4071196
  15. 15.
      #!/usr/bin/env perl
      # wget.pl - save a web page together with the resources it references.
      #
      # Usage: wget.pl URL
      #
      # Fetches URL, downloads every resource referenced through a "src"
      # attribute and every <link href> target (rewriting url(...) references
      # inside the linked files), drops <script> elements, unwraps <noscript>
      # elements (their content is kept), and writes the rewritten page to
      #   ./<sanitized-url>/<YYYY-MM-DD>/index.html
      # Downloaded resources go to the "file/" directory next to index.html.
      # The path of the written index.html is printed on success.
      use strict;
      use warnings;
      use utf8;
      use DateTime;
      use Digest::SHA1 qw(sha1_hex);
      use Encode;
      use File::Path qw/make_path/;
      use HTML::Entities qw(encode_entities);
      use HTML::Parser;
      use HTML::ResolveLink;
      use HTTP::Request::Common qw/GET/;
      use IO::All;
      use LWP::UserAgent;
      use URI;

      my $path = './';

      # URI->new never returns a false value, so a missing argument has to be
      # detected before the URI object is built.
      my $target = shift;
      (defined $target && length $target)
          or die "Usage: $0 URL\n";
      my $uri = URI->new($target);

      my $now = DateTime->now;
      my $ymd = $now->ymd;

      my $ua = LWP::UserAgent->new(
          agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
      );
      my $resolver = HTML::ResolveLink->new(base => $uri);

      my $res = $ua->request(GET($uri));
      $res->is_success
          or die "GET $uri failed: " . $res->status_line . "\n";

      # Make every link in the page absolute so resources can be fetched as-is.
      my $content = $resolver->resolve($res->decoded_content);

      # Output directory: <path>/<url with unsafe characters folded to "-">/<ymd>/
      my $dir = "$uri";
      $dir =~ s{[^A-Za-z0-9.]+}{-}g;
      $dir =~ s{-+$}{};
      $dir = "$path/$dir/$ymd/";
      $dir =~ s{/+}{/}g;
      make_path($dir);

      my $disallow_tag  = qr{script};      # elements removed with their content
      my $nodisplay_tag = qr{noscript};    # elements whose tags are removed, content kept
      my $result  = '';
      my $context = { disallow => 0 };     # > 0 while inside a disallowed element

      my $parser = HTML::Parser->new(
          api_version => 3,
          start_h => [
              sub {
                  my ($self, $tagname, $attr) = @_;
                  return if $tagname =~ /^(?:$nodisplay_tag)$/i;
                  if ($tagname =~ /^(?:$disallow_tag)$/i) {
                      $context->{disallow}++;
                      return;
                  }
                  return if $context->{disallow} > 0;

                  $result .= "<$tagname";
                  for my $key (sort keys %$attr) {
                      # HTML::Parser reports the "/" of a self-closing tag as
                      # an attribute; it must not be re-emitted as one.
                      next if $key eq '/';
                      my $value = $attr->{$key};
                      if ($key =~ /^(?:src)$/i) {
                          # Download the resource and point at the local copy.
                          $value = get_src($value);
                      }
                      elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
                          $value = get_link($value);
                      }
                      elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
                          $value = $path;
                      }
                      $value = '' unless defined $value;
                      # Attribute values arrive entity-decoded, so they are
                      # re-encoded before being written back between quotes.
                      my $escaped = encode_entities($value, '<>&"');
                      $result .= qq{ $key="$escaped"};
                  }
                  $result .= ">";
              },
              'self,tagname,attr',
          ],
          end_h => [
              sub {
                  my ($self, $tagname, $text) = @_;
                  return if $tagname =~ /^(?:$nodisplay_tag)$/i;
                  if ($tagname =~ /^(?:$disallow_tag)$/i) {
                      # Ignore a stray end tag instead of letting the counter
                      # go negative, which would leak the next script body.
                      $context->{disallow}-- if $context->{disallow} > 0;
                      return;
                  }
                  return if $context->{disallow} > 0;
                  $result .= $text;
              },
              'self,tagname,text',
          ],
          default_h => [
              sub {
                  my ($self, $text) = @_;
                  return if $context->{disallow} > 0;
                  $result .= $text;
              },
              'self,text',
          ],
      );
      $parser->parse($content);
      $parser->eof;    # flush any text the parser is still buffering

      # XXX: the output is always written as UTF-8, so a matching charset
      # declaration is injected right after the opening <head> tag (an
      # existing declaration in the page is not removed).
      $result =~ s{(<head(?:\s[^>]*)?>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i;
      $result = Encode::encode('utf-8', $result);
      $result > io("${dir}index.html");
      print "${dir}index.html\n";

      # get_src($src)
      #   Downloads $src into "${dir}file/" unless a copy is already there.
      #   Returns the path of the local copy relative to $dir ("file/<name>").
      #   Returns $src unchanged when it is empty/undefined, is a data: URI,
      #   or cannot be downloaded (a warning is printed in that last case).
      sub get_src {
          my $src = shift;
          return $src unless defined $src && length "$src";
          $src = "$src";    # callers may pass a URI object
          return $src if $src =~ m{\Adata:}i;

          make_path("${dir}file") unless -e "${dir}file";

          # Local file name: the URL with unsafe characters folded to "-",
          # replaced by its SHA-1 when that would be too long for a file name.
          my $file = $src;
          $file =~ s{[^A-Za-z0-9.]+}{-}g;
          $file = sha1_hex($file) if length($file) > 255;
          $file = "file/$file";
          $file =~ s{/+}{/}g;

          unless (-e "$dir$file") {
              my $response = $ua->request(GET($src));
              unless ($response->is_success) {
                  warn "GET $src failed: " . $response->status_line . "\n";
                  return $src;
              }
              $response->content >> io("$dir$file");
              sleep(1);    # throttle so the server's anti-DoS protection is not triggered
          }
          return $file;
      }

      # get_link($url)
      #   Downloads the file at $url via get_src() and rewrites every
      #   url(...) reference inside it to point at a downloaded local copy.
      #   Returns the same value get_src() returns for $url.
      sub get_link {
          my $url = shift;
          return $url unless defined $url && length "$url";

          my $file = get_src($url);
          # Nothing to rewrite when the download did not produce a local file.
          return $file unless -e "$dir$file";

          my $content = io("$dir$file")->slurp;
          $content =~ s{url\(([^)]+)\)}{ 'url(' . _localize_css_url($1, $url) . ')' }eg;
          $content > io("$dir$file");
          return $file;
      }

      # _localize_css_url($link, $base)
      #   Resolves one url(...) argument against $base, downloads it, and
      #   returns the reference to use from inside the "file/" directory.
      sub _localize_css_url {
          my ($link, $base) = @_;
          $link =~ s{^[\s"']+}{};
          $link =~ s{[\s"']+$}{};
          return $link unless length $link;

          # relative link (from HTML::ResolveLink)
          my $u = URI->new($link);
          $u = $u->abs($base) unless defined $u->scheme;

          my $local = get_src($u);
          # The referencing file itself lives in file/, so the prefix is dropped.
          $local =~ s{^file/}{};
          return $local;
      }
  16. 16. GoogleChrome
  17. 17. wget.pl
  18. 18. どうぞご利用ください!gist.github.com/ 4071196
  19. 19. ご清聴ありがとうございました

×