# Gebruiker:DajasjBot/referentieachterpunt.py

from bz2file import BZ2File
from lxml import etree
import os
import pywikibot
import time
import re
from datetime import datetime
# The dump is not fetched by this script; download it manually beforehand.
DUMP_PATH = "nlwiki-latest-pages-articles.xml.bz2"

# XML namespace that qualifies every tag looked up in the dump.  It has to
# match the export schema version of the downloaded dump, otherwise no page
# element is found.
EXPORT_NS = "{http://www.mediawiki.org/xml/export-0.10/}"

# Edit summary used when saving a corrected page.
EDIT_SUMMARY = u"Referenties na leesteken geplaatst"

# Minimum number of seconds between two saved edits.
MIN_SECONDS_BETWEEN_EDITS = 60

# Punctuation marks that should precede a reference (regex class content).
PUNCT = r"\.\,\!\?\;"

# Opening (or self-closing) <ref ...> tag.  The word boundary keeps the
# pattern from matching <references> / <references />, which must stay on
# its own line.
REF_OPEN = r"<ref\b[^<>]*?>"

# A complete <ref ...>...</ref> element without nested tags.
REF_FULL = REF_OPEN + r"[^<>]*?</ref>"

# Compiled once: these are applied to every page of the dump.
WHITESPACE_BEFORE_REF_RE = re.compile(r"\s+(%s)" % REF_OPEN)
REF_BEFORE_PUNCT_RE = re.compile(r"%s[%s]" % (REF_FULL, PUNCT))
MOVE_PUNCT_RE = re.compile(r"([^%s])(%s)([%s])" % (PUNCT, REF_FULL, PUNCT))
DROP_PUNCT_RE = re.compile(r"([%s])(%s)[%s]" % (PUNCT, REF_FULL, PUNCT))


def needs_fix(text):
    """Return True if *text* contains a reference placement violation.

    A violation is either a complete reference directly followed by a
    punctuation mark, or whitespace directly in front of a <ref> tag.
    """
    return bool(
        REF_BEFORE_PUNCT_RE.search(text)
        or WHITESPACE_BEFORE_REF_RE.search(text)
    )


def fix_reference_placement(text):
    """Return *text* with references placed directly after punctuation.

    Two corrections are applied:

    * whitespace directly in front of a <ref> tag is removed;
    * a punctuation mark directly behind a reference is moved in front of
      it, or dropped when a punctuation mark already precedes it.
    """
    # Remove preceding whitespace; one pass suffices because removing it
    # cannot create new whitespace in front of another tag.
    text = WHITESPACE_BEFORE_REF_RE.sub(r"\1", text)

    # Move or delete punctuation.  Several passes are needed for chains of
    # adjacent references, where the mark travels one reference per pass.
    while REF_BEFORE_PUNCT_RE.search(text):
        updated = MOVE_PUNCT_RE.sub(r"\1\3\2", text)
        updated = DROP_PUNCT_RE.sub(r"\1\2", updated)
        if updated == text:
            # Both substitutions need a character in front of the
            # reference; a reference at the very start of the text can
            # never match, so stop instead of looping forever.
            break
        text = updated
    return text


def release_element(element):
    """Free *element* and its already processed preceding siblings.

    iterparse keeps every parsed element attached to the tree; without
    this the whole dump would accumulate in memory.
    """
    element.clear()
    parent = element.getparent()
    if parent is not None:
        while element.getprevious() is not None:
            del parent[0]


def process_dump_page(site, dump_page):
    """Correct the live version of one dump page if it needs it.

    Returns True when an edit was saved, False when the page was skipped.
    """
    # Get the text and the title from the dump
    dump_text = dump_page.findtext(
        EXPORT_NS + "revision/" + EXPORT_NS + "text")
    title = dump_page.findtext(EXPORT_NS + "title")

    # Check if violation on dump version (missing text cannot violate)
    if not dump_text or not title or not needs_fix(dump_text):
        return False

    page = pywikibot.Page(site, title)

    # Check namespace first: it needs no request to the wiki
    if page.namespace().id != 0:
        return False

    # Check protection
    edit_protection = page.protection().get("edit")
    if edit_protection and edit_protection[0] == "sysop":
        return False

    # Work on the live version; the dump may be outdated
    live_text = page.text
    fixed_text = fix_reference_placement(live_text)
    if fixed_text == live_text:
        return False

    # Upload changes
    page.text = fixed_text
    page.save(EDIT_SUMMARY)
    return True


def main():
    """Scan the dump and fix reference placement on the live wiki."""
    site = pywikibot.Site()
    start = datetime.now()

    with BZ2File(DUMP_PATH) as xml_file:
        dump_pages = etree.iterparse(xml_file, tag=EXPORT_NS + "page")
        for _, dump_page in dump_pages:
            saved = process_dump_page(site, dump_page)
            release_element(dump_page)

            if saved:
                # Pause for rate limit.  total_seconds() is used because
                # timedelta.seconds ignores whole days and fractions.
                elapsed = (datetime.now() - start).total_seconds()
                time.sleep(max(0, MIN_SECONDS_BETWEEN_EDITS - elapsed))
                start = datetime.now()


if __name__ == "__main__":
    main()