from bz2file import BZ2File
from lxml import etree
import os
import pywikibot
import time
import re
from datetime import datetime
# XML namespace of the MediaWiki export schema used by the dump.
# NOTE(review): the schema version is hard-coded; if the dump in use declares
# a different export version no <page> element will match -- verify against
# the root element of the downloaded dump.
MW_NS = "{http://www.mediawiki.org/xml/export-0.10/}"

# Sentence punctuation that belongs before a reference, not after it.
PUNCT = ".,!?;"

# Opening (or self-closing) <ref ...> tag.  The word boundary keeps
# <references /> from being treated as a <ref> tag.
REF_OPEN = r"<ref\b[^<>]*?>"
# A complete <ref ...>...</ref> element without nested markup.
REF_FULL = REF_OPEN + r"[^<>]*?</ref>"

# One whitespace character directly in front of a <ref> tag.
SPACE_BEFORE_REF = re.compile(r"\s(%s)" % REF_OPEN)
# A reference that is directly followed by punctuation.
REF_THEN_PUNCT = re.compile(r"%s[%s]" % (REF_FULL, PUNCT))
# text<ref>..</ref>.  ->  text.<ref>..</ref>   (also at the very start of text)
MOVE_PUNCT = re.compile(r"(^|[^%s])(%s)([%s])" % (PUNCT, REF_FULL, PUNCT))
# text.<ref>..</ref>. ->  text.<ref>..</ref>   (drop the duplicate punctuation)
DROP_PUNCT = re.compile(r"([%s])(%s)[%s]" % (PUNCT, REF_FULL, PUNCT))


def has_misplaced_ref(text):
    """Return True if *text* has whitespace before a ref or punctuation after one.

    Empty or missing text (``None``) never counts as a violation.
    """
    if not text:
        return False
    return bool(REF_THEN_PUNCT.search(text) or SPACE_BEFORE_REF.search(text))


def fix_ref_placement(text):
    """Return *text* with references placed directly after the punctuation.

    Whitespace in front of a ``<ref>`` tag is removed, and punctuation that
    follows a ``<ref>...</ref>`` element is moved in front of it (or dropped
    when punctuation already precedes the reference).
    """
    # Each pass removes one whitespace character per tag, so repeat until
    # whole runs of whitespace are gone.
    while SPACE_BEFORE_REF.search(text):
        print("b")  # progress marker, one per pass
        text = SPACE_BEFORE_REF.sub(r"\1", text)
    # With several consecutive references the punctuation hops over one
    # reference per pass, hence the loop.  Every pass handles at least the
    # leftmost violation, so the loop always terminates.
    while REF_THEN_PUNCT.search(text):
        text = MOVE_PUNCT.sub(r"\1\3\2", text)
        text = DROP_PUNCT.sub(r"\1\2", text)
    return text


def main():
    """Scan the nlwiki dump and fix reference placement on the live wiki."""
    site = pywikibot.Site()
    last_edit = datetime.now()
    # The dump has to be downloaded manually beforehand.
    with BZ2File("nlwiki-latest-pages-articles.xml.bz2") as xml_file:
        pages = etree.iterparse(xml_file, tag=MW_NS + "page")
        for _, dump_page in pages:
            dump_text = dump_page.findtext(
                MW_NS + "revision/" + MW_NS + "text")
            title = dump_page.findtext(MW_NS + "title")

            # Only the two strings above are needed; release the parsed
            # element and its already-processed siblings so memory use stays
            # flat while streaming through the dump.
            dump_page.clear()
            while dump_page.getprevious() is not None:
                del dump_page.getparent()[0]

            # Cheap pre-filter on the dump version of the page.
            if not title or not has_misplaced_ref(dump_text):
                continue

            page = pywikibot.Page(site, title)
            # Only touch pages in namespace 0.
            if page.namespace().id != 0:
                continue
            # Skip pages whose editing is restricted to sysops.
            edit_protection = page.protection().get("edit")
            if edit_protection and edit_protection[0] == "sysop":
                continue

            # The live text may differ from the dump, so fix that one.
            live_text = page.text
            fixed_text = fix_ref_placement(live_text)
            if fixed_text == live_text:
                continue

            # Upload changes
            page.text = fixed_text
            page.save(u"Referenties na leesteken geplaatst")
            # Pause so that edits are at least 60 seconds apart.
            elapsed = (datetime.now() - last_edit).total_seconds()
            time.sleep(max(0, 60 - elapsed))
            last_edit = datetime.now()


if __name__ == "__main__":
    main()