利用者:Bcxfubot/BOT作業依頼/log/20240517-2/prog
#!/usr/bin/python
# pylint: disable=consider-using-f-string
"""
[orig] honbun_archive.py
URL張替
[http://www.blog.com/blog.html ページタイトル]
↓
[https://web.archive.org/20210503125012/http://www.blog.com/blog.html ページタイトル]
一行に複数リンクがある場合に、最後のやつだけしか置換されない問題あり
→ 後日再度本スクリプトを動かして、最後以外のも置換させる必要あり。2020.2.13
"""
import re
from urllib.parse import urlparse
import time
import urllib.parse
import pywikibot
import requests
from requests.exceptions import Timeout
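# Regex for the external links that should be pointed at the Wayback Machine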
TARGET_RE = r"https:\/\/www\.toei\.co\.jp\/release\/tv\/"
MAXCOUNT = 120  # maximum number of pages to process in one run
SLEEPSEC = 60   # seconds to wait after a saved edit
######################################################
# Processing mode: 1 = save edits, 0 = do not save (dry run)
#PROCMODE = 0
PROCMODE = 1
######################################################
#def get_domain(url):
# parsed_uri = urlparse(url )
# result = '{uri.netloc}'.format(uri=parsed_uri)
# return result
def get_domain(target):
    """ Return the domain part of the first URL found in target. """
    url = ""
    result = re.search("(http[^ ]+)", target)
    if result:
        url = result.group(1)
    else:
        return target
    parsed_uri = urlparse(url)
    result = '{uri.netloc}'.format(uri=parsed_uri)
    return result
def get_date_core(origurl):
    """ Core routine: query the Wayback availability API for a snapshot timestamp. """
encoded_url = urllib.parse.quote(origurl, safe="")
print("encoded_url = "+ encoded_url)
#spark_url = "https://web.archive.org/__wb/sparkline?url=" + encoded_url + \
# "&collection=web&output=json"
    #api_url = "https://archive.org/wayback/available?url=" + encoded_url + "&timestamp=20010101"
    api_url = "https://archive.org/wayback/available?url=" + encoded_url + "&timestamp=21010101"
print("api_url = "+ api_url)
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
headers = {'User-Agent': user_agent}
try:
#response = requests.get(spark_url,timeout=60.0)
response = requests.get(api_url,timeout=60.0,headers=headers)
except Timeout:
print("ERROR: timeout")
raise
except Exception as err:
print("ERROR: Exception")
print(err)
raise
print("response.text = " + response.text)
if "504 Gateway Time-out" in response.text:
return ""
data = response.json()
print(data)
#date = data["last_ts"]
#date = data["first_ts"]
try:
date = data["archived_snapshots"]["closest"]["timestamp"]
except TypeError:
date = ""
except KeyError:
date = ""
print(date)
return date
def get_date(origurl):
    """ Fetch an archive timestamp, retrying once after a short sleep on failure. """
result = ""
for i in range(2):
result = get_date_core(origurl)
if result != "":
break
print("sleep(10), i=" + str(i))
time.sleep(10)
return result
def get_stat(url):
    """ Return the HTTP status code for url. """
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
headers = {'User-Agent': user_agent}
try:
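        # HEAD for PDFs, presumably to avoid downloading large files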
if ".pdf" in url:
response = requests.head(url,headers=headers,timeout=5.0)
else:
response = requests.get(url,headers=headers,timeout=5.0)
except requests.exceptions.ConnectionError:
print("ERROR: ConnectionError")
return 404
except Timeout:
print("ERROR: timeout")
raise
except Exception as err:
print("ERROR: Exception")
print(err)
raise
print(response.status_code)
print(response.url)
return response.status_code
def is_domain_webarchive(origurl, origline):
    """ Return True if the URL's domain already appears on the same line as a web.archive.org link. """
domain = ""
result = re.search( r"https?:\/\/([0-9\.a-zA-Z_-]+)\/", origurl)
if result:
domain = result.group(1)
print("domain="+domain)
regexp = r"https?:\/\/" + domain
print("regexp="+regexp)
#regexp = regexp.replace("\/","\\\/")
#regexp = regexp.replace("\.","\\\.")
pattern = r"https:\/\/web\.archive\.org\/web\/[0-9a-z_]+\/" + regexp
print("pattern="+pattern)
if re.search( pattern, origline):
return True
return False
def make_newline(origline):
    """ Build the rewritten line with archive links inserted. """
newline = ""
#result = re.search( "^(.*)\[(http:\/\/dir\.yahoo\.co\.jp\/[^ ]+) ([^\]]*)\](.*)$" , origline )
#result = re.search( TARGET_RE , origline )
#result = re.search( "^(.*)\[(" + TARGET_RE + "[^ ]+) ([^\]]*)\](.*)$", origline)
print("origline="+origline)
#result = re.search( "(" + TARGET_RE + "[^ \]\}\|]+)", origline)
#result = re.search( "\[(" + TARGET_RE + "[^ \]\}\|]+)", origline)
#result = re.search( "\[(" + TARGET_RE + "[^ \]\}\|]*)", origline)
#result = re.search( "(" + TARGET_RE + "[^ \]\}\|]*)", origline)
#result = re.search( "(" + TARGET_RE + "[^ \]\}\|\<]*)", origline)
#matchedlist = re.findall( "(" + TARGET_RE + "[^ \]\}\|\<]*)", origline)
#matchedlist = re.findall( "[^\/]" + TARGET_RE + "[^ \]\}\|\<]*", origline)
pattern = r"https?://[^ \t\|\]\}<\)]+"
matchedlist = re.findall( pattern, origline)
newline = origline
if matchedlist:
for url in matchedlist:
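            # Strip a trailing "-->" left over when the URL sits inside an HTML comment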
result = re.search( r"(http.*[^\-])-->$", url)
if result:
url = result.group(1)
origurl = url
print("origurl = " + origurl)
result = re.search( "web.archive.org", origurl)
if result:
print( "ERR: This is archive.org. pass")
continue
if is_domain_webarchive( origurl, origline):
print( "ERR: is_domain_webarchive() is True. pass")
continue
result = re.search( TARGET_RE, origurl)
if not result:
continue
#stat = get_stat( origurl )
#if stat not in (400, 404, 405, 410):
# continue
date = get_date( origurl )
if date == "":
#return ""
continue
print("date = " + date)
ardate = date
newurl = "https://web.archive.org/web/" + ardate + "/" + origurl
#newline = origline.replace( origurl, newurl)
#newline = newline.replace( origurl, newurl)
            # For {{Cite web|url=AAA|archiveurl=AAA}} cases, limit to a single
            # replacement so the archiveurl side is not rewritten. 2020.11.19
newline = newline.replace( origurl, newurl, 1)
print("newline = " + newline)
            # Check that web.archive.org has not been written twice. 2020.7.8
result = re.search( r"https:\/\/web\.archive\.org\/(web\/)?[0-9]+\/"\
r"https:\/\/web\.archive\.org\/", newline)
if result:
print("ERROR: web.archive.org 二重書き")
raise Exception
return newline
def mk_comment():
    """ Build the edit summary. """
if "http:" in TARGET_RE:
com1 = "http:// -> web.archive.org"
else:
com1 = "https:// -> web.archive.org"
com2 = "(" + get_domain( TARGET_RE.replace("\\","") ) + ")"
com3 = "[[Wikipedia:Bot作業依頼#東映のリンク切れをウェイバックマシンに置換]] ([[Wikipedia:Bot|Bot]]による編集)"
comment = "外部リンクの修正 " + com1 + " " + com2 + " " + com3
return comment
def replace_page(site,pagetitle):
    """ Rewrite dead links on one page and save it; return True if a save happened. """
is_saved = False
page = pywikibot.Page(site, pagetitle)
#text = page.text
#print(text)
linelist = page.text.split('\n')
#print(linelist)
comment = ""
gaibu = 0
modflag = 0
outtext = ""
for line in linelist:
        ## Lines that already carry an archive reference (e.g. archiveurl) are
        ## skipped because they easily lead to double rewriting. 2020.11.19
#tmp_re = TARGET_RE + ".*" + TARGET_RE
if ( re.search(r"[Ww]ayback",line) or
#re.search("archiveurl", line) or
#re.search(tmp_re, line) or
re.search(r"[Aa]rchive\.(is|ph|li|fo|vn|md|today)", line) or
re.search(r"[wW]eb[aA]rchive", line) ):
outtext += line + "\n"
continue
#print(gaibu,line)
if re.search(TARGET_RE,line):
newline = make_newline( line )
if newline != "":
if line != newline:
line = newline
comment = newline
print(gaibu,line)
modflag = 1
outtext += line + "\n"
    # Adjust the trailing newline so it matches the original page text
if page.text[-1:] != "\n":
if outtext[-1:] == "\n":
outtext = outtext[:-1]
difflen = len(outtext) - len(page.text)
print("difflen=" + str(difflen))
if ( ( difflen < -30 ) or
( difflen > 2600 ) ):
raise Exception
if modflag == 1:
page.text = outtext
if PROCMODE == 1:
comment = mk_comment()
page.save(comment)
is_saved = True
return is_saved
# Return one page title that still needs processing.
# Return "" when there is nothing left to process.
def get_pagetitle():
    """ Return the next unprocessed page title from the "list" file. """
path = "list"
with open(path, encoding="utf-8") as file:
for s_line in file:
s_line = s_line.rstrip("\n")
#print(s_line)
#if not re.search(",sumi", s_line):
if not s_line.endswith(",sumi"):
return s_line
return ""
# Mark the processed line by appending ",sumi"
def done_pagetitle(pagetitle):
    """ Append ",sumi" to the processed page title in the "list" file. """
path = "list"
alltext = ""
with open(path, encoding="utf-8") as file:
for s_line in file:
s_line = s_line.rstrip("\n")
#print(s_line + "\n")
#if re.search(pagetitle, s_line):
if pagetitle == s_line:
s_line = s_line + ",sumi"
alltext += s_line + "\n"
with open(path, mode='w', encoding="utf-8") as file:
file.write(alltext)
return ""
def sub():
    """ Process up to MAXCOUNT page titles from the "list" file. """
site = pywikibot.Site()
site.login()
for i in range(MAXCOUNT):
pagetitle = get_pagetitle()
print("[" + str(i + 1) + "/" + str(MAXCOUNT) + "]" + ":" + "pagetitle=" + pagetitle)
if pagetitle == "":
break
is_saved = replace_page(site,pagetitle)
done_pagetitle(pagetitle)
#if ( i < (MAXCOUNT - 1) ):
# print("sleep(" + str(SLEEPSEC) + ")")
# time.sleep(SLEEPSEC)
print("is_saved=" + str(is_saved))
if is_saved:
print("sleep(" + str(SLEEPSEC) + ")")
time.sleep(SLEEPSEC)
else:
print("sleep(" + str(5) + ")")
time.sleep(5)
def main():
""" main """
sub()
print("done.")
if __name__ == '__main__':
main()