I've migrated my company's Perspective wiki to dokuwiki. The Perspective wiki had dozens of collections and editors and attachments and hundreds of pages, and the importing script worked reasonably well. Perspective is a Windows hosted wiki, so this is a Windows Python script. (The path separators are hardcoded.)
ToDo: This script creates an “imported.log” file with the same format as “changes.log”, but it can't be merged into changes.log as is. Andreas points out that, “The ID and the lastmod timestamp needs to match. So first create the file then get it's timestamp and save that one to the changelog.”
Requirements:
# ConvertPerspectiveToDokuwiki
# By: David Blume
#
# This was a quick hack, but it works reasonably well.
import codecs
import os
import shutil
import sys
import time
from xml.dom import minidom
# Path of the DokuWiki "data" directory that pages and media are written
# under.  Empty until main() sets it to "<dokuwiki_dir>\data".
doku_dir = ""
def CopyAttachments(attachments, path, doku_page, doku_namespace):
    """Copy a page's attachments into the DokuWiki media tree.

    Every ``attachment`` child of *attachments* is expected to carry a
    ``version`` and a ``name`` element.  The file is read from
    ``<path>\\<version>-attachments\\<name>`` and written to
    ``<doku_dir>\\media<doku_namespace>\\<name lowercased, spaces as "_">``.
    Attachments that the page body has not already embedded (their media id
    is missing from the module-level ``attached_images`` list) get an
    "Automatically Attached" line appended to *doku_page*.

    Args:
        attachments: DOM node whose children are the ``attachment`` elements.
        path: the page's ``versions`` directory.
        doku_page: writable file-like object for the page being generated.
        doku_namespace: "" for the root namespace, otherwise the namespace
            with a leading backslash (it is appended to directory paths).
    """
    for attachment in attachments.childNodes:
        if attachment.nodeName != u'attachment':
            continue
        version = attachment.getElementsByTagName("version")[0].firstChild.data
        name = attachment.getElementsByTagName("name")[0].firstChild.data
        doku_name = name.lower().replace(" ", "_")
        doku_media_path = doku_dir + "\\media" + doku_namespace
        if not os.path.isdir(doku_media_path):
            os.makedirs(doku_media_path)
        source_file = path + "\\" + version + "-attachments\\" + name
        # Copy in-process rather than through a shell "copy" command: no
        # quoting problems with odd file names, no leaked pipe handles, and
        # the copy has finished by the time we move on.
        try:
            shutil.copyfile(source_file, doku_media_path + "\\" + doku_name)
        except (IOError, OSError) as err:
            # Best effort, like the shell command it replaces, but say so.
            print("WARNING: could not copy attachment " + source_file + ": " + str(err))
        media_id = name
        if doku_namespace:
            # Drop the leading backslash to form "namespace:name".
            media_id = doku_namespace[1:] + ":" + name
        if media_id not in attached_images:
            doku_page.write("\nAutomatically Attached : {{" + media_id + "}}\n")
# Stack of the list types currently open while ParsePage recurses:
# "o" for <ol>, "u" for <ul>; the innermost list is last.
doku_list_types = []
# Media ids that the page being converted embeds inline ({{...}}).
# Walk resets it for every page and CopyAttachments consults it.
attached_images = []
# Handle on the "imported.log" change log; assigned by main().
changes_log = None

# Entities are assembled at runtime instead of being written literally so
# that HTML-aware tooling cannot silently decode them inside this source.
# NOTE(review): in the listing this script was recovered from, these three
# replacements had degenerated into no-ops (replace("&", "&") and so on),
# presumably decoded by the page hosting the code.  They are restored here
# as entity decoding -- confirm against the original script.
_AMPERSAND = "&"
_TEXT_REPLACEMENTS = (
    # "amp" goes last so that an escaped entity is not unescaped twice.
    (_AMPERSAND + "lt;", "<"),
    (_AMPERSAND + "gt;", ">"),
    (_AMPERSAND + "amp;", "&"),
    # Typographic characters are flattened to plain ASCII.
    (u'\u201c', "\""),
    (u'\u201d', "\""),
    (u'\xb4', "'"),
    (u'\u2019', "'"),
    (u'\u2013', "-"),
    (u'\u2022', "."),
    (u'\u2026', "..."),
    (u'\u2018', "'"),
    (u'\xb7', "*"),
    # Must stay last: a literal "**" would otherwise start bold text.
    ("**", "<nowiki>**</nowiki>"),
)

# Headline markup per heading element.
_HEADER_MARKUP = {
    u'h1': "======",
    u'h2': "=====",
    u'h3': "====",
    u'h4': "===",
    u'h5': "==",
}

# (style fragment, DokuWiki markup) pairs, in the order they are opened;
# they are closed in reverse so the markup nests properly.
_SPAN_STYLES = (
    ("font-weight:bold;", "**"),
    ("text-decoration:underline;", "__"),
    ("font-style:italic;", "//"),
)


def ParsePage(node, doku_page, doku_namespace, doku_name):
    """Recursively write the DokuWiki markup for a DOM *node*.

    Text nodes are left-stripped, run through the replacement table and
    written.  ``img`` and ``link`` elements become ``{{...}}`` and
    ``[[...]]`` references.  Every other element writes its opening markup,
    recurses into its children, then writes its closing markup.

    Args:
        node: the DOM node to convert.
        doku_page: writable file-like object receiving the markup.
        doku_namespace: "" for the root namespace, otherwise the namespace
            with a leading backslash.
        doku_name: page name; unused here, only passed down the recursion.

    Side effects:
        Pushes and pops ``doku_list_types`` around list elements and appends
        the id of every inline image to ``attached_images``.
    """
    if node.nodeType == minidom.Node.TEXT_NODE:
        text = node.nodeValue.lstrip()
        if text:
            for old, new in _TEXT_REPLACEMENTS:
                text = text.replace(old, new)
            doku_page.write(text)
        return

    node_name = node.nodeName

    if node_name == u'img':
        src = node.getAttribute("src")
        marker = src.find("name=")
        # Without a "name=" parameter, use the whole src instead of slicing
        # it at an arbitrary offset.
        src_name = src[marker + len("name="):] if marker != -1 else src
        if doku_namespace:
            src_name = doku_namespace[1:] + ":" + src_name
        doku_page.write("{{" + src_name + "}}")
        attached_images.append(src_name)
        return

    if node_name == u'link':
        link_dest = node.getElementsByTagName("name")[0].firstChild.data
        # Inside a namespace the leading ":" of the target is dropped.
        if link_dest.startswith(":") and doku_namespace:
            link_dest = link_dest[1:]
        link_text = node.getElementsByTagName("anchor")[0].firstChild.data
        doku_page.write("[[" + link_dest + "|" + link_text + "]]")
        return

    open_markup = ""
    close_markup = ""
    opened_list = False

    # TODO: tables are not converted.
    # TODO: <div style="margin-left:40px;"> indentation is not converted.
    if node_name == u'a':
        open_markup = "[[" + node.getAttribute("href") + "|"
        close_markup = "]]"
    elif node_name == u'ol' or node_name == u'ul':
        doku_list_types.append("o" if node_name == u'ol' else "u")
        opened_list = True
        close_markup = "\n"
    elif node_name == u'li':
        # DokuWiki needs at least two spaces of indent per list level.
        # An <li> outside any list is treated as a top-level bullet.
        depth = max(len(doku_list_types), 1)
        ordered = bool(doku_list_types) and doku_list_types[-1] == "o"
        open_markup = "  " * depth + ("- " if ordered else "* ")
        close_markup = "\n"
    elif node_name in _HEADER_MARKUP:
        marks = _HEADER_MARKUP[node_name]
        open_markup = marks + " "
        close_markup = " " + marks + "\n"
    elif node_name == u'span':
        style = node.getAttribute("style")
        for fragment, mark in _SPAN_STYLES:
            if fragment in style:
                open_markup += mark
                close_markup = mark + close_markup
    elif node_name == u'pre':
        open_markup = "''"
        close_markup = "''"

    if open_markup:
        doku_page.write(open_markup)

    for subnode in node.childNodes:
        ParsePage(subnode, doku_page, doku_namespace, doku_name)

    if opened_list:
        doku_list_types.pop()
    if close_markup:
        doku_page.write(close_markup)

    # Paragraph and line breaks are only emitted outside lists.
    if not doku_list_types:
        if node_name == u'p':
            doku_page.write("\n\n")
        if node_name == u'br' or node_name == u'div':
            doku_page.write("\\\\ \n")
def GetNode(nodes, attribute, name):
    """Return the first node whose *attribute* equals *name*.

    Args:
        nodes: iterable of DOM elements (anything with ``getAttribute``).
        attribute: name of the attribute to compare.
        name: value the attribute must have.

    Returns:
        The first matching node, or None when nothing matches.
    """
    for candidate in nodes:
        if candidate.getAttribute(attribute) == name:
            return candidate
    return None
def Walk(path):
    """Recursively convert every ``*.page`` directory found below *path*.

    Directories whose name ends in ".page" are converted; any other
    directory is descended into.  Plain files are ignored.
    """
    for filename in os.listdir(path):
        fullpath = path + "\\" + filename
        if not os.path.isdir(fullpath):
            continue
        if filename.endswith(".page"):
            _ConvertPage(fullpath, filename)
        else:
            Walk(fullpath)


def _ConvertPage(fullpath, filename):
    """Convert the latest version of one page directory to a DokuWiki page.

    Reads the version number from ``latest.txt``, parses
    ``versions\\<version>.xml``, writes ``<doku_dir>\\pages<namespace>\\<name>.txt``,
    copies the attachments and appends one line to ``changes_log``.
    Pages without a readable ``latest.txt`` are skipped with a warning.
    """
    global attached_images
    print("Parsing " + fullpath + "...")
    try:
        with open(fullpath + "\\latest.txt", "r") as latest:
            version = latest.read().strip()
    except (IOError, OSError):
        print("WARNING: " + filename + " does not have a latest version.")
        return

    source = minidom.parse(fullpath + "\\versions\\" + version + ".xml")
    try:
        assert source.documentElement.tagName == "page-data"
        fields = source.documentElement.getElementsByTagName("field")
        name = GetNode(fields, "name", "page.name").firstChild.data
        # NOTE(review): names with more than one ":" are not handled; the
        # unpacking below raises ValueError for them.
        if ":" in name:
            doku_namespace, doku_name = name.split(":")
            # The namespace is appended to directory paths, hence the
            # leading backslash.
            doku_namespace = "\\" + doku_namespace
        else:
            doku_namespace, doku_name = "", name
        doku_namespace = doku_namespace.lower().replace(" ", "_")
        doku_name = doku_name.lower().replace(" ", "_")

        user = GetNode(fields, "name", "page.last-edit-username").firstChild.data.lower()
        date = GetNode(fields, "name", "page.last-edit-server-time").firstChild.data
        date = str(int(time.mktime(time.strptime(date, '%d/%b/%y %H:%M:%S'))))

        page_dir = doku_dir + "\\pages" + doku_namespace
        if not os.path.isdir(page_dir):
            os.makedirs(page_dir)

        # Start a fresh list of inline images for this page.
        attached_images = []
        # Write UTF-8 explicitly so characters outside the replacement table
        # in ParsePage cannot abort the import with an encoding error.
        doku_page = codecs.open(page_dir + "\\" + doku_name + ".txt", 'w', 'utf-8')
        try:
            ParsePage(GetNode(fields, "name", "page.contents"), doku_page, doku_namespace, doku_name)
            attachments = GetNode(fields, "name", "page.attachments")
            if attachments is not None:
                CopyAttachments(attachments, fullpath + "\\versions", doku_page, doku_namespace)
        finally:
            doku_page.close()
    finally:
        source.unlink()

    log_line = "\t".join([
        date,
        "127.0.0.1",
        doku_namespace[1:] + ":" + doku_name,
        # Keep only what follows the first ":" of the user name, if any.
        user[user.find(":") + 1:],
        "imported",
    ]) + '\n'
    # The log is opened in binary mode, so encode explicitly.
    changes_log.write(log_line.encode("utf-8"))
def main(args):
    """Run the import.

    Args:
        args: ``[perspective_dir, dokuwiki_dir]``.  *perspective_dir* is the
            tree to walk; *dokuwiki_dir* is the directory containing
            DokuWiki's ``data`` directory.

    Sets the module-level ``doku_dir`` and ``changes_log``, writes
    ``<dokuwiki_dir>\\data\\imported.log`` and converts every page found.
    Any other number of arguments prints an error and does nothing.
    """
    global doku_dir
    global changes_log
    if len(args) != 2:
        print("ERROR: main() expects [perspective_dir, dokuwiki_dir]")
        return
    source_dir = args[0]
    doku_dir = args[1] + "\\data"
    changes_log = open(doku_dir + "\\imported.log", 'wb')
    try:
        Walk(source_dir)
    finally:
        # Close the log even when a page fails to convert.
        changes_log.close()
    print("Finished parsing " + source_dir)
# Script entry point: expects exactly two command-line arguments,
# otherwise prints the usage text.
if __name__ == '__main__':
    if len(sys.argv) == 3:
        main(sys.argv[1:])
    else:
        print("usage: " + sys.argv[0] + " perspective_dir dokuwiki_dir")
        print("Where the perspective_dir contains the .col (collection) directories, and")
        print("the dokuwiki_dir contains the dokuwiki's data directory as a child directory.")