利用者:Bcxfubot/BOT作業依頼/log/20240517-3/prog

#!/usr/bin/python
# pylint: disable=consider-using-f-string
"""
[orig] first_archive.py
first_archive.pyは、新聞ニュース記事などで使う。最初のアーカイブが重要なやつ。
[http://www.blog.com/blog.html ページタイトル]
↓
{{Wayback|url=http://www.blog.com/blog.html |title=ページタイトル}}


一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
→ 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13
"""

import re
import time
import urllib.parse
from urllib.parse import urlparse
import pywikibot
import requests
from requests.exceptions import Timeout

# Regex matching the URL prefix of the external links this run rewrites.
TARGET_RE = r"https:\/\/www\.toei\.co\.jp\/tv\/"
# Upper bound on the number of list entries handled by one run of sub().
MAXCOUNT = 120
# Seconds to sleep after a page has actually been saved.
SLEEPSEC = 60

######################################################
# Processing mode: replace_page() saves the page only when PROCMODE == 1.
#PROCMODE = 0
PROCMODE = 1
######################################################

def get_domain(target):
    """Return the network location of the first URL found in *target*.

    The URL is the first run of characters that starts with "http" and
    extends up to the next ASCII or full-width space. When *target*
    contains no such run, *target* itself is returned unchanged.
    """
    found = re.search ( "(http[^ 　]+)", target)
    if found is None:
        # Nothing URL-like: hand the input back as-is.
        return target
    return urlparse(found.group(1)).netloc

def get_date_core(origurl):
    """Query the Wayback availability API once for *origurl*.

    Returns the "timestamp" value of the "closest" snapshot in the JSON
    response, or "" when the response body contains "504 Gateway Time-out"
    or the JSON has no such entry.

    Any exception raised by requests.get() is logged and re-raised; an
    error from response.json() propagates to the caller.
    """
    encoded_url = urllib.parse.quote(origurl, safe="")
    print("encoded_url = "+ encoded_url)
    # NOTE(review): the far-future timestamp presumably makes "closest" the
    # most recent snapshot - confirm against the availability API docs.
    api_url = "".join((
        "https://archive.org/wayback/available?url=",
        encoded_url,
        "&timestamp=21010101",
    ))
    print("api_url = "+ api_url)

    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
    }
    try:
        response = requests.get(api_url,timeout=60.0,headers=headers)
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as err:
        print("ERROR: Exception")
        print(err)
        raise

    body = response.text
    print("response.text = " + body)
    # A gateway error page is not JSON; report "no result" so the caller
    # can retry.
    if "504 Gateway Time-out" in body:
        return ""

    data = response.json()
    print(data)
    try:
        timestamp = data["archived_snapshots"]["closest"]["timestamp"]
    except (TypeError, KeyError):
        # The response carries no "closest" snapshot entry.
        timestamp = ""
    print(timestamp)
    return timestamp

def get_date(origurl):
    """Look up the snapshot timestamp for *origurl*, trying at most twice.

    Calls get_date_core() and returns its result as soon as it is not "".
    After every empty result the function logs and sleeps 10 seconds; when
    both attempts come back empty, "" is returned.
    """
    attempt = 0
    timestamp = ""
    while attempt < 2:
        timestamp = get_date_core(origurl)
        if timestamp != "":
            return timestamp
        print("sleep(10), i=" + str(attempt))
        time.sleep(10)
        attempt += 1

    return timestamp


def get_stat( url ):
    """Fetch *url* and return the HTTP status code of the response.

    URLs containing ".pdf" are requested with HEAD, all others with GET.
    A requests ConnectionError is reported as status 404; a timeout or any
    other exception is logged and re-raised.
    """
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
    }
    try:
        # NOTE(review): HEAD for PDFs presumably avoids downloading the
        # document body - confirm intent.
        fetch = requests.head if ".pdf" in url else requests.get
        response = fetch(url,headers=headers,timeout=5.0)
    except requests.exceptions.ConnectionError:
        # An unreachable host is treated like a missing page.
        print("ERROR: ConnectionError")
        return 404
    except Timeout:
        print("ERROR: timeout")
        raise
    except Exception as err:
        print("ERROR: Exception")
        print(err)
        raise
    status = response.status_code
    print(status)
    print(response.url)
    return status

def _link_to_wayback(match):
    """Turn one matched ``[url title]`` link into a ``{{Wayback}}`` template.

    *match* must expose the URL as group 1 and the link text as group 2.
    The matched text is returned unchanged when the link text contains a
    ``[[`` internal link or when get_date() finds no snapshot.
    """
    origurl = match.group(1)
    origtext = match.group(2)
    print("origurl = " + origurl)
    print("origtext = " + origtext)

    # Link text containing [[...]] is not captured correctly by the
    # bracket-based pattern, so such links are left alone.
    if "[[" in origtext:
        return match.group(0)

    date = get_date( origurl )
    # No archived snapshot available: keep the original link.
    if date == "":
        return match.group(0)
    print("date = " + date)

    # A raw "|" inside the title would split the template parameter.
    origtext = origtext.replace("|", "&#124;")
    return ("{{Wayback|url=" + origurl +
            " |title=" + origtext +
            " |date=" + date + "}}")


def make_newline( origline ):
    """Build the replacement for one wikitext line.

    Every ``[URL title]`` external link whose URL matches TARGET_RE is
    converted to ``{{Wayback|url=... |title=... |date=...}}``. All matching
    links on the line are handled, not only the last one.

    Returns:
        "" when the line contains no matching link; otherwise the line
        with each convertible link replaced. The result equals *origline*
        when no link could be converted.
    """
    # "]" is excluded from the URL so a title-less "[url]" cannot swallow
    # the text that follows it.
    link_re = re.compile(r"\[(" + TARGET_RE + r"[^ 　\]]*)[ 　]+([^\]]*)\]")
    if link_re.search(origline) is None:
        return ""
    newline = link_re.sub(_link_to_wayback, origline)
    print("newline = " + newline)
    return newline

def replace_page(site,pagetitle):
    """Rewrite matching bullet-list links on one page and save the result.

    Each line of the page text that starts with "*" and matches TARGET_RE
    is passed to make_newline(). Lines containing "<ref", "ref>",
    "web.archive.org" or "Wayback" are copied through untouched.

    Args:
        site: Site object handed to pywikibot.Page.
        pagetitle: Title of the page to process.

    Returns:
        True when the page was saved (at least one line changed and
        PROCMODE == 1), otherwise False.
    """
    page = pywikibot.Page(site, pagetitle)

    # Literal markers of lines that must not be touched: references and
    # links that already point at an archive.
    skip_markers = ("<ref", "ref>", "web.archive.org", "Wayback")

    outlines = []
    modified = False
    for line in page.text.split('\n'):
        if any(marker in line for marker in skip_markers):
            outlines.append(line)
            continue
        if line.startswith("*") and re.search( TARGET_RE, line):
            newline = make_newline( line )
            # "" means no link matched; an identical line means nothing
            # could be converted.
            if newline not in ("", line):
                line = newline
                print(line)
                modified = True
        outlines.append(line)

    if not modified:
        return False

    # Joining with "\n" is the exact inverse of split('\n'), so a trailing
    # newline is neither added nor lost.
    page.text = "\n".join(outlines)
    if PROCMODE != 1:
        return False

    page.save("外部リンクの修正 http:// -> {{Wayback}} (" +
            get_domain( TARGET_RE.replace("\\","") ) +
            ") [[Wikipedia:Bot作業依頼#東映のリンク切れをウェイバックマシンに置換]] ([[Wikipedia:Bot|Bot]]による編集)")
    return True

# Return one page title that still needs processing,
# or "" when there is none.
def get_pagetitle():
    """Return the first line of the "list" file not ending in ",sumi".

    Lines are compared after stripping the trailing newline. Returns ""
    when every line is already marked (or the file is empty).
    """
    with open("list", encoding="utf-8") as listfile:
        titles = (row.rstrip("\n") for row in listfile)
        pending = (title for title in titles if not title.endswith(",sumi"))
        return next(pending, "")

# Mark a processed line with ",sumi".
def done_pagetitle(pagetitle):
    """Append ",sumi" to every line of "list" that equals *pagetitle*.

    The file is read completely and then rewritten; each line is written
    back followed by a newline. Always returns "".
    """
    path = "list"
    with open(path, encoding="utf-8") as src:
        rows = [row.rstrip("\n") for row in src]

    marked = [row + ",sumi" if row == pagetitle else row for row in rows]

    with open(path, mode='w', encoding="utf-8") as dst:
        dst.write("".join(row + "\n" for row in marked))
    return ""

def sub():
    """Process pending page titles one by one, at most MAXCOUNT of them.

    For each title returned by get_pagetitle(): run replace_page(), mark
    the title as done, then sleep SLEEPSEC seconds if the page was saved
    or 5 seconds otherwise. Stops early when no pending title is left.
    """
    site = pywikibot.Site()
    site.login()
    for count in range(1, MAXCOUNT + 1):
        pagetitle = get_pagetitle()
        print("[" + str(count) + "/" + str(MAXCOUNT) + "]" + ":" + "pagetitle=" + pagetitle)
        if pagetitle == "":
            break
        is_saved = replace_page(site,pagetitle)
        done_pagetitle(pagetitle)

        print("is_saved=" + str(is_saved))
        # Wait longer after a real edit than after a no-op.
        pause = SLEEPSEC if is_saved else 5
        print("sleep(" + str(pause) + ")")
        time.sleep(pause)

def main():
    """Script entry point: run the batch in sub() and print "done." afterwards."""
    sub()
    print("done.")

# Run the bot only when this file is executed as a script.
if __name__ == '__main__':
    main()