Parser hook into MediaWiki using Python
Jump to navigation
Jump to search
"""Dump the pages of a MediaWiki category to standalone HTML "card" files.

For every page in the chosen category the rendered HTML is fetched from the
wiki, its links are rewritten to point at the locally dumped files (and its
images at scaled renditions on the wiki), and the result is saved as
``<output>/<page>.html``.  If a ``User_talk:`` page with the same title
exists, it is saved alongside as the back of the card.
"""
import argparse
import os
import sys
from urllib.parse import quote as urlquote, unquote as urlunquote
from xml.etree import ElementTree as ET

import html5lib
from mwclient import Site

THUMB_SIZE = 320
FULL_SIZE = 960

NS_MAIN = 0
NS_TALK = 1
NS_USER = 2
NS_USER_TALK = 3
NS_CATEGORY = 14

# Links whose href starts with this prefix are rewritten to local files.
ARTICLE_PREFIX = "/mediadesign/"

# Shared skeleton for both the front and the back of a card.
PAGE_TEMPLATE = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Tasks of the Contingent Librarian</title>
<link rel="stylesheet" type="text/css" href="tasks.css">
<script src="tasks.js"></script>
</head>
<body>
<div class="{css_class}">{content}</div>
</body>
</html>
"""

# The mwclient Site used by the helpers below; assigned in main().
site = None


def catmembers(c):
    """Return a listing of the members of category *c* of type "page".

    Members are requested sorted by sortkey in ascending order, with the
    ``ids`` and ``title`` properties.

    NOTE(review): this goes through mwclient's listing helpers
    (``get_prefix`` / ``generate_kwargs`` / ``get_list``); confirm they are
    available in the installed mwclient version.
    """
    prefix = c.get_prefix('cm', True)
    kwargs = dict(c.generate_kwargs(
        prefix,
        prop='ids|title',
        namespace=None,
        sort='sortkey',
        dir='asc',
        start=None,
        end=None,
        title=c.name,
        type="page",
    ))
    return c.get_list(True)(c.site, 'categorymembers', 'cm', **kwargs)


def path4page(p):
    """Return the local file name for page *p*.

    Only the last "/"-separated component of the page title is kept, spaces
    become underscores, and pages in the User_talk namespace get a "_rvrs"
    suffix so that the back of a card does not overwrite its front.
    """
    ret = p.page_title.rsplit("/", 1)[-1].replace(" ", "_")
    if p.namespace == NS_USER_TALK:
        ret += "_rvrs"
    return ret + ".html"


def href4page(p):
    """Return the URL-quoted local file name for page *p*."""
    # The original called the undefined name ``path4path`` and dropped the
    # result instead of returning it.
    return urlquote(path4page(p))


def filenameforlink(href):
    """Return the local file name that a wiki link *href* should point to.

    Only the last "/"-separated component of the link is kept.  A
    ``#fragment`` is preserved after the ``.html`` extension so that section
    links keep working.

    TODO: deal with namespaces?
    """
    path, hash_sign, fragment = href.partition("#")
    path = path.rsplit("/", 1)[-1]
    return path + ".html" + hash_sign + fragment


def _scaledimageurl(title, width):
    """Return the URL of *title* scaled to *width* pixels, or None.

    None is returned when the API response carries no usable image info.
    """
    r = site.api(
        "query",
        prop="imageinfo",
        titles=title,
        iiprop="url",
        iiurlwidth=str(width),
        formatversion=2,
    )
    try:
        return r['query']['pages'][0]['imageinfo'][0]['thumburl']
    except (KeyError, IndexError):
        return None


def rewriteimagelink(a):
    """Point image link *a* at scaled renditions of the image, in place.

    The anchor's href becomes the FULL_SIZE rendition and the contained
    ``<img>`` gets the THUMB_SIZE rendition as its src; the img's width,
    height and srcset attributes are dropped.  The link is left untouched
    when the wiki returns no image info for it.
    """
    href = a.attrib.get("href")
    if not href:
        return
    # The href is percent-encoded, but the API expects the plain title.
    path = urlunquote(href.rsplit("/", 1)[-1])
    print("rewriteimagelink", path)
    thumburl = _scaledimageurl(path, THUMB_SIZE)
    fullsizeurl = _scaledimageurl(path, FULL_SIZE)
    if thumburl is None or fullsizeurl is None:
        print("rewriteimagelink: no image info for {}".format(path),
              file=sys.stderr)
        return
    a.attrib['href'] = fullsizeurl
    img = a.find("img")
    if img is not None:
        img.attrib['src'] = thumburl
        for name in ("width", "height", "srcset"):
            img.attrib.pop(name, None)
    print("rewriteimagelink", thumburl, fullsizeurl)


def rewritelinks(html):
    """Return *html* with its links rewritten for the local dump.

    Links with "external" in their class are left alone, links whose class
    is exactly "image" are handed to rewriteimagelink(), and links starting
    with ARTICLE_PREFIX are pointed at the corresponding local file.
    """
    t = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        if linkclass == "image":
            # link to the presentation version of the image and
            # change img.src to a thumbnail
            rewriteimagelink(a)
        elif href.startswith(ARTICLE_PREFIX):
            a.attrib['href'] = filenameforlink(href)
    return ET.tostring(t, method="html", encoding="unicode")


def _writecard(path, css_class, pagename):
    """Render wiki page *pagename* and save it to *path*.

    The page is fetched and rewritten before the file is opened, so a failed
    request does not leave a truncated file behind.
    """
    htmlsrc = site.parse(page=pagename)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    # The template declares charset utf-8, so write the file as utf-8.
    with open(path, "w", encoding="utf-8") as f:
        f.write(PAGE_TEMPLATE.format(css_class=css_class, content=htmlsrc))


def main():
    """Parse the command line and dump every card of the category."""
    global site

    p = argparse.ArgumentParser(description="Dump wiki files to html")
    p.add_argument("--host", metavar='', default="pzwiki.wdka.nl",
                   help='wiki host')
    p.add_argument("--path", metavar='', default="/mw-mediadesign/",
                   help="Wiki path. Should end with /")
    p.add_argument("--output", default="tasks",
                   help="Output path for pages")
    p.add_argument("--category", default="Tasks of the Contingent Librarian",
                   help="Category to query")
    args = p.parse_args()

    site = Site(host=args.host, path=args.path)
    tasks = site.Categories[args.category]
    os.makedirs(args.output, exist_ok=True)

    # FOR EVERY CARD
    for card in catmembers(tasks):
        # FRONT
        cardpath = os.path.join(args.output, path4page(card))
        print("Saving page to {}".format(cardpath))
        _writecard(cardpath, "card", card.name)

        # BACK
        talk = site.pages["User_talk:" + card.page_title]
        if talk.exists:
            print("OUTPUTTING CARD BACK {}".format(talk.page_title))
            talkpath = os.path.join(args.output, path4page(talk))
            _writecard(talkpath, "cardback", talk.name)


if __name__ == "__main__":
    main()