コンテンツにスキップ

英文维基 | 中文维基 | 日文维基 | 草榴社区

利用者:Bcxfubot/BOT作業依頼/log/20210222/chousa1/prog

#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Encode;
use LWP::Simple;
use LWP::UserAgent;

# Disabled alternatives kept for reference (not loaded / not executed).
#use LWP::Protocol::https
#binmode( STDOUT, ":utf8" );
# Encode everything printed to standard output as UTF-8.
binmode( STDOUT, ":encoding(UTF-8)" );

#use Jcode;
# Getopt::Std supplies getopts(), used at the bottom of the script.
use Getopt::Std;
# External helper scripts loaded at run time. main() calls all_check() and
# is_avoid_page(), which are not defined in this file and presumably come
# from these scripts -- TODO confirm which file defines which.
require 'subs_my.pl';
require 'all_check.pl';
require 'mini_check.pl';

# Turn on autoflush for standard output so results are not held in a buffer.
$| = 1;

# Command-line switches, filled in by getopts() before main() runs.
our %opts      = ();
# Reset to 0 by main() at the end of every page; not read anywhere else in
# this file -- presumably shared with the required scripts (verify).
our $incomment = 0;

#------------------------------------------------------------------
# sub
#------------------------------------------------------------------
# Render a string as space-separated, two-digit lowercase hex byte values.
#
#   $sIn      - text to convert
#   $encoding - encoding name passed to Encode::encode() to turn the text
#               into bytes before dumping it
#
# Returns a string such as "e3 81 82"; empty input yields an empty string.
sub str_to_byte
{
    my( $sIn, $encoding ) = @_;

    # Work on the encoded byte string, not on the characters.
    my $octets = encode( $encoding, $sIn );

    # One "H2" unpack per byte gives its two hex digits.
    my @hex_pairs = map { unpack( "H2", $_ ) } split( //, $octets );

    return join( ' ', @hex_pairs );
}

#------------------------------------------------------------------
# main
#------------------------------------------------------------------
# Scan an XML dump (the <page>/<title>/<id>/<ns>/<comment>/<sha1> tags
# handled here match the MediaWiki export format) one line at a time.
#
# Input:
#   $ARGV[0]   - path of the dump file; echoed as the first line of output.
#   $opts{'c'} - when set to 1 (the -c switch), every line of a page is
#                collected and handed to all_check() at </page>; the text
#                search below is skipped in that mode.
#
# Without -c, each remaining line of a page that is not rejected by
# is_avoid_page() has its XML entities decoded and is searched for the
# hard-coded pattern; hits are printed as a wiki list item with the page
# title followed by the matching line wrapped in <nowiki>.
#
# Dies with a message if the dump path is missing or the file cannot be
# opened (previously this only produced warnings and an empty result).
sub main {
    my $dumpfile = $ARGV[0];
    defined $dumpfile
        or die "usage: $0 [-c] <dumpfile>\n";
    print $dumpfile, "\n";

    open( my $dump_fh, "<:encoding(UTF-8)", $dumpfile )
        or die "cannot open '$dumpfile': $!\n";

    # Text being searched for in page bodies.
    my $target_re = qr/ディアゴスティーニ/;

    # -c is a plain flag, so getopts stores 1 when it was given.
    my $check_mode = defined $opts{'c'} && $opts{'c'} == 1;

    # Per-page state; everything here is reset when </page> is seen.
    my @page     = ();
    my $title    = "";
    my $id       = "";
    my $ns       = "";
    my $inpage   = 0;
    my $passpage = 0;

    LINE:
    while ( my $line = <$dump_fh> ) {

        # Start of a new page.
        if ( !$inpage && $line =~ /<page>/ ) {
            $inpage = 1;
            push( @page, $line );
            next LINE;
        }

        # In -c mode keep every line of the page, including </page>.
        if ( $check_mode && $inpage ) {
            push( @page, $line );
        }

        # End of page: run the page-level check (if requested) and reset.
        if ( $line =~ m{</page>} ) {
            all_check(@page) if $check_mode;

            $inpage    = 0;
            @page      = ();
            $incomment = 0;    # package global declared at file level
            $passpage  = 0;
            $title     = "";
            $id        = "";
            $ns        = "";
            next LINE;
        }

        next LINE if !$inpage;

        # Only the first <title>, <id> and <ns> of a page are captured;
        # once set, later lines with the same tag fall through to the
        # text handling below.
        if ( $title eq "" && $line =~ m{<title>(.*)</title>} ) {
            $title = $1;
            $passpage = 1 if is_avoid_page($title);
            next LINE;
        }
        if ( $id eq "" && $line =~ m{<id>(.*)</id>} ) {
            $id = $1;
            next LINE;
        }
        if ( $ns eq "" && $line =~ m{<ns>(.*)</ns>} ) {
            $ns = $1;
            next LINE;
        }

        # Lines carrying <comment>, a complete <sha1> element, or
        # </comment> are never searched.
        next LINE if $line =~ /<comment>/;
        next LINE if $line =~ m{<sha1>.*</sha1>};
        next LINE if $line =~ m{</comment>};

        next LINE if $passpage;
        next LINE if $check_mode;

        # Drop everything up to and including the tag end that precedes
        # the first line of the text body.
        $line =~ s/.*xml:space="preserve">//;

        # Decode XML entities; &amp; goes last so that an escaped entity
        # such as "&amp;lt;" is not decoded twice.
        $line =~ s/&lt;/</g;
        $line =~ s/&gt;/>/g;
        $line =~ s/&quot;/"/g;
        $line =~ s/&apos;/'/g;
        $line =~ s/&amp;/&/g;
        chomp($line);

        if ( $line =~ $target_re ) {
            print "* [[:$title]]\n";
            print "*: <nowiki>[$line]</nowiki>\n";
        }
    }

    close $dump_fh;
}
# Parse the command-line switches into %opts (only the -c flag is known),
# then run the scan. getopts() returns false on an unknown switch, so stop
# with a usage message instead of continuing with a misread command line.
getopts( "c", \%opts )
    or die "usage: $0 [-c] <dumpfile>\n";

main();



# EOF