# Gebruiker:DajasjBot/archivelinks.py

from bz2file import BZ2File
from lxml import etree
import os
import waybackpy
import pywikibot
import time
import re
from tqdm import notebook
from datetime import datetime
# Site object passed to every pywikibot.Page lookup further down.
site = pywikibot.Site()
# Start of the current rate-limit window; the main loop measures the time
# elapsed since this moment before sleeping and resets it after each save.
start = datetime.now()

# User-Agent string handed to waybackpy.Url for its requests.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"


def citetemplate_to_dict(citetemplate):
    """Split a ``{{Citeer web|...}}`` template into a parameter dict.

    The last two characters of *citetemplate* are dropped (the closing
    ``}}``), the text is split on ``|`` and the first piece (the template
    name) is discarded.  Every remaining ``key=value`` piece becomes one
    dict entry; keys and values are returned exactly as written, including
    surrounding whitespace.

    A piped wikilink inside a value (``[[Target|label]]`` or
    ``[[Bestand:X.jpg|thumb|caption]]``) is split by the ``|`` as well, so
    pieces are glued back onto the most recent parameter for as long as its
    value contains an unclosed ``[[``.  Pieces without ``=`` that do not
    belong to a wikilink (positional parameters) are ignored.

    Args:
        citetemplate: Template wikitext, expected to end with ``}}``.

    Returns:
        dict mapping parameter name to parameter value (both ``str``).
    """
    segments = citetemplate[:-2].split("|")[1:]
    params = {}
    last_key = None
    for segment in segments:
        # True while the value collected so far opened a wikilink that has
        # not been closed yet; the current piece is then part of that link
        # even if it happens to contain "=".
        in_open_link = (
            last_key is not None
            and params[last_key].count("[[") > params[last_key].count("]]")
        )
        if in_open_link:
            params[last_key] += "|" + segment
        elif "=" in segment:
            last_key, value = segment.split("=", 1)
            params[last_key] = value
        elif "]]" in segment and last_key is not None:
            # Stray link tail: keep it attached to the last real parameter
            # instead of looking up a key that may not exist.
            params[last_key] += "|" + segment

    return params


# Lower-case Dutch month names mapped to their calendar month number (1-12).
datedict = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "januari",
            "februari",
            "maart",
            "april",
            "mei",
            "juni",
            "juli",
            "augustus",
            "september",
            "oktober",
            "november",
            "december",
        ),
        start=1,
    )
}

# The dump is not fetched by this script: download
# nlwiki-latest-pages-articles.xml.bz2 manually into the working directory.

# Safety switch.  While True the bot behaves as a dry run against its own
# sandbox: every edit goes to TEST_PAGE_TITLE instead of the article, the
# main-namespace check is skipped (the sandbox is not an article) and the
# run stops after the first page that was fully processed.  Set to False for
# a production run over the whole dump.
TEST_MODE = True
TEST_PAGE_TITLE = "Gebruiker:DajasjBot/Kladblok"

# Minimum number of seconds between two page saves.
SECONDS_BETWEEN_SAVES = 60

_MW_NS = '{http://www.mediawiki.org/xml/export-0.10/}'

# A "Citeer web" template without nested braces.
_CITE_TEMPLATE_RE = re.compile(r"{{[Cc]iteer web[^{}]*?}}")
# An archive-url parameter that already holds a value of 10+ characters.
_ARCHIVE_URL_RE = re.compile(
    r"archief-{0,1}url\s*?=\s*?[^\|\s]{10,}|archive-{0,1}url\s*?=\s*?[^\|\s]{10,}",
    re.IGNORECASE)

# Parameter names that may carry the date the source was consulted,
# in order of preference.
_RETRIEVE_DATE_KEYS = (
    'bezochtdatum', 'accessdate', 'datumbezocht', 'datumgeraadpleegd',
    'raadpleegdatum', 'access-date',
)

# Parameters that are dropped before the fresh archive data is appended.
_ARCHIVE_KEYS = frozenset([
    "archiefdatum", "archivedate", "archive-date", "dodeurl",
    "dode-url", "deadurl", "dead-url", "archiefurl",
    "archiveurl", "archive-url"
])

_ISO_DATE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
_DMY_DATE_RE = re.compile(r"(\d{1,2})-(\d{1,2})-(\d{4})")
_DUTCH_DATE_RE = re.compile(r"(\d{1,2})\s([a-zA-Z]{3,10})\s(\d{4})",
                            re.IGNORECASE)


def _templates_missing_archive(text):
    """Return the Citeer web templates in *text* that lack an archive url."""
    return [
        template for template in _CITE_TEMPLATE_RE.findall(text)
        if not _ARCHIVE_URL_RE.search(template)
    ]


def _get_retrieve_date(params):
    """Return the stripped consulted-on date from *params*, or ""."""
    for key in _RETRIEVE_DATE_KEYS:
        if key in params:
            return params[key].strip()
    return ""


def _parse_retrieve_date(retrievedate):
    """Parse *retrievedate* into ``(year, month, day)`` ints, or None.

    Accepts ``YYYY-MM-DD``, ``D-M-YYYY`` and ``D <Dutch month name> YYYY``.
    """
    match = _ISO_DATE_RE.search(retrievedate)
    if match:
        year, month, day = match.groups()
        return int(year), int(month), int(day)

    match = _DMY_DATE_RE.search(retrievedate)
    if match:
        day, month, year = match.groups()
        return int(year), int(month), int(day)

    match = _DUTCH_DATE_RE.search(retrievedate)
    if match:
        day, month_name, year = match.groups()
        month = datedict.get(month_name.lower())
        if month is not None:
            return int(year), month, int(day)

    return None


def _find_archive(url, retrievedate):
    """Return a Wayback Machine archive object for *url*, or None.

    If the url has never been archived a new snapshot is requested.
    Otherwise the snapshot nearest to *retrievedate* is used, or the newest
    one when no date is given.  Problems are printed and reported as None so
    that one bad reference does not abort the run.
    """
    try:
        wayback = waybackpy.Url(url, user_agent)
        if wayback.total_archives() == 0:
            return wayback.save()
        if retrievedate == "":
            return wayback.newest()

        parsed = _parse_retrieve_date(retrievedate)
        if parsed is None:
            print("date is wrong")
            print(url)
            print(retrievedate)
            return None
        year, month, day = parsed
        return wayback.near(year=year, month=month, day=day)
    except Exception as e:  # best-effort boundary for a single reference
        print(e)
        print(url)
        return None


def _rebuild_template(citetemplate, archive):
    """Return *citetemplate* with its archive parameters replaced.

    The template is re-parsed without stripping so the original spacing of
    the retained parameters is kept.
    """
    params = {
        key: value
        for key, value in citetemplate_to_dict(citetemplate).items()
        if key.strip() not in _ARCHIVE_KEYS
    }
    params['archiefurl'] = archive.archive_url
    params['archiefdatum'] = archive.timestamp.strftime("%Y-%m-%d")
    params['dodeurl'] = "nee"
    template_head = citetemplate.split("|")[0]
    return template_head + "|" + "|".join(
        key + "=" + value for key, value in params.items()) + "}}"


with BZ2File("nlwiki-latest-pages-articles.xml.bz2") as xml_file:
    dump_pages = etree.iterparse(xml_file, tag=_MW_NS + 'page')
    for _, dump_page in notebook.tqdm(dump_pages):
        try:
            # Text and title as stored in the dump; a page may have no text.
            dump_text = dump_page.findtext(
                _MW_NS + 'revision/' + _MW_NS + 'text')
            title = dump_page.findtext(_MW_NS + 'title')

            # Cheap pre-filter on the dump before touching the live wiki.
            if not _templates_missing_archive(dump_text or ""):
                continue

            if TEST_MODE:
                title = TEST_PAGE_TITLE

            # Get page from the live version
            page = pywikibot.Page(site, title)
            live_text = page.text

            # Skip pages that only sysops may edit.
            protection = page.protection()
            if "edit" in protection and protection["edit"][0] == "sysop":
                continue

            # Only articles (namespace 0) are edited in production.
            if not TEST_MODE and page.namespace().id != 0:
                continue

            # The live page may have been fixed since the dump was made.
            live_templates = _templates_missing_archive(live_text)
            if not live_templates:
                continue

            changed = False
            for citetemplate in live_templates:
                params = {
                    key.strip(): value.strip()
                    for key, value in citetemplate_to_dict(
                        citetemplate).items()
                }

                url = params.get('url') or params.get('URL')
                if not url:
                    continue

                archive = _find_archive(url, _get_retrieve_date(params))
                if archive is None:
                    continue

                live_text = live_text.replace(
                    citetemplate, _rebuild_template(citetemplate, archive))
                changed = True

            if changed:
                page.text = live_text
                page.save(u"Archiefurl toegevoegd")

                # Pause for rate limit
                elapsed = (datetime.now() - start).total_seconds()
                time.sleep(max(0, SECONDS_BETWEEN_SAVES - elapsed))
                start = datetime.now()

            if TEST_MODE:
                break
        finally:
            # Runs on every path (continue, break, error) so the parsed
            # dump does not accumulate in memory.
            dump_page.clear()

            # Also eliminate now-empty references from the root node to elem
            for ancestor in dump_page.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]