'''
Wikicreole-style parser. Mostly compliant.
Written 2010-12-19. danomagnum.com
'''
__version__ = 1.0
import re
import math
# Master switch for tables of contents: parse() only considers prepending
# one when this is true.
USE_TOC = True
# parse() prepends a table of contents only when the rendered output is
# longer than this many characters.
TOC_LENGTH = 1000
def tocmaker(contents):
    '''Build an HTML table of contents from a page's headings.

    contents is a sequence of (text, level) pairs in document order, where
    text is the heading text (also used as the anchor target, so it must
    match the id attribute given to the heading) and level is the heading
    depth as an integer (1 for the outermost).

    Returns a nested <ul> of links, or an empty string when there are three
    headings or fewer (too few to be worth a table of contents).
    '''
    if len(contents) <= 3:
        return ''

    base_level = min(level for _text, level in contents)
    current_level = base_level
    parts = ['<ul>\n']
    for text, level in contents:
        delta = level - current_level
        if delta > 0:
            # Going deeper: open one nested list per level skipped.
            parts.append('<ul>' * delta)
        elif delta < 0:
            # Coming back up: close the lists we are leaving.
            parts.append('</ul>\n' * -delta)
        current_level = level
        parts.append("<li><a href='#" + text + "'>" + text + "</a></li>\n")

    # Close the outermost list plus every nested list still open, so the
    # markup stays balanced even when the last heading is a nested one.
    parts.append('</ul>\n' * (current_level - base_level + 1))
    return ''.join(parts)
# Inline markup patterns paired with their HTML replacements. They are
# compiled once at import time and applied in this order: the triple-brace
# inline form must run before the double-brace image form, and the
# "target|text" variants before the bare ones.
_INLINE_SUBSTITUTIONS = (
    (re.compile(r"\{\{\{(.+?)\}\}\}"), r'<code>\1</code>'),
    (re.compile(r'\[\[([^\|^\]]+?)\|(.+?)\]\]'), r'<a href="\1">\2</a>'),
    (re.compile(r'\[\[([^\|^\]]+?)\]\]'), r'<a href="\1">\1</a>'),
    (re.compile(r'\{\{([^\|^\}]+?)\|(.+?)\}\}'), r'<img src="\1" alt="\2" />'),
    (re.compile(r'\{\{([^\|^\}]+?)\}\}'), r'<img src="\1" alt="\1" />'),
)

# Heading markers, longest first so '==' is not mistaken for '='.
_HEADINGS = (
    ('=====', 'h5', 5),
    ('====', 'h4', 4),
    ('===', 'h3', 3),
    ('==', 'h2', 2),
    ('=', 'h1', 1),
)

# URL prefixes whose '//' must not be read as the italic marker, each with
# the temporary placeholder that hides it while formatting is applied.
_URL_GUARDS = (
    ('http://', '!http:~~!'),
    ('https://', '!https:~~!'),
    ('ftp://', '!ftp:~~!'),
)


def _list_item(line, marker, tag, depth):
    '''Turn a list line into an <li> and work out the nesting change.

    line starts with marker ('#' or '*'); tag is the matching list element
    ('ol' or 'ul'); depth is the number of such lists currently open.
    Returns (markup, item, new_depth): markup opens or closes lists to reach
    the new depth and item is the <li> element for the line.
    '''
    leading = len(line) - len(line.lstrip(marker))
    # Only the leading run of markers counts, and a list may deepen by at
    # most one level per line; surplus markers stay in the item text.
    target = min(leading, depth + 1)
    if target > depth:
        markup = ('<' + tag + '>') * (target - depth)
    else:
        markup = ('</' + tag + '>\n') * (depth - target)
    return markup, '<li>' + line[target:].strip() + '</li>', target


def _table_row(line):
    '''Convert a |cell|cell| line to a <tr>; cells starting with = are headers.'''
    cells = line.split('|')[1:]
    # The closing pipe is optional: drop the empty piece it leaves behind,
    # but keep a real last cell when the pipe was omitted.
    if cells and not cells[-1].strip():
        cells.pop()
    row = ['<tr>']
    for cell in cells:
        if cell.startswith('='):
            row.append('<th>' + cell[1:] + '</th>')
        else:
            row.append('<td>' + cell + '</td>')
    row.append('</tr>')
    return ''.join(row)


def _close_formats(formats):
    '''Return closing tags for every open inline format and mark them closed.'''
    closing = []
    for fmt in formats:
        if fmt[2]:
            closing.append('</' + fmt[1] + '>')
            fmt[2] = False
    return ''.join(closing)


def parse(text):
    '''Convert (almost) WikiCreole markup to HTML.

    The text is processed line by line; leading whitespace is ignored
    outside preformatted blocks. Supported markup:

        **bold**  //italic//  __underline__
        [[link]]  [[link|link text]]
        {{image}}  {{image|alt text}}
        {{{inline code}}}
        *item, **nested item          unordered lists
        #item, ##nested item          ordered lists
        =Heading 1 ... =====Heading 5 (trailing = signs are optional)
        |=header|cell|                table row
        two backslashes               <br />
        ----                          <hr />
        %comment                      the line is dropped
        blank line                    closes inline formatting, starts <p>
        {{{ at the start of a line    opens a <pre> block that runs until a
                                      line starting with }}}; text after the
                                      {{{ becomes the class of the <pre>

    Every heading gets its text as its id attribute. When USE_TOC is true
    and the output is longer than TOC_LENGTH characters, the result of
    tocmaker() for the collected headings is prepended.

    The input is not HTML-escaped, so only feed it trusted text.
    Returns the generated HTML as a string.
    '''
    in_pre = False
    in_table = False
    olist_depth = 0
    ulist_depth = 0
    # [marker, tag, currently open?]; the open state carries across lines
    # until the closing marker or a blank line is seen.
    formats = [['**', 'b', False], ['//', 'i', False], ['__', 'u', False]]
    out = []
    contents = []

    for line in text.split('\n'):
        if in_pre:
            # Preformatted text is copied verbatim until the closing braces.
            if line.startswith('}}}'):
                out.append('</pre>\n')
                in_pre = False
            else:
                out.append(line + '\n')
            continue

        line = line.lstrip()

        # Ordered lists; any other kind of line closes the open ones.
        if line.startswith('#'):
            markup, line, olist_depth = _list_item(line, '#', 'ol', olist_depth)
            out.append(markup)
        elif olist_depth:
            out.append('</ol>\n' * olist_depth)
            olist_depth = 0

        # Unordered lists, handled the same way.
        if line.startswith('*'):
            markup, line, ulist_depth = _list_item(line, '*', 'ul', ulist_depth)
            out.append(markup)
        elif ulist_depth:
            out.append('</ul>\n' * ulist_depth)
            ulist_depth = 0

        if line.startswith('|'):
            if not in_table:
                in_table = True
                out.append('<table>\n')
            line = _table_row(line)
        elif in_table:
            out.append('</table>\n')
            in_table = False

        if line == '':
            # A blank line ends the paragraph, so no inline format may
            # leak past it.
            out.append(_close_formats(formats))
            out.append('<p>\n')
            continue

        if line.startswith('%'):
            continue

        if line.startswith('----'):
            out.append('<hr />\n')
            continue

        # A line that also contains the closing braces is inline code, not
        # the start of a block, and is left to the inline substitutions.
        if line.startswith('{{{') and '}}}' not in line:
            css_class = line[3:].strip()
            if css_class:
                out.append("<pre class='" + css_class + "'>")
            else:
                out.append('<pre>')
            in_pre = True
            continue

        for marker, tag, level in _HEADINGS:
            if line.startswith(marker):
                title = line.strip('=').strip()
                out.append('<' + tag + " id='" + title + "'>")
                contents.append((title, level))
                # The heading text still goes through inline formatting.
                line = title + '</' + tag + '>'
                break

        line = line.replace(r'\\', '<br />')

        for pattern, replacement in _INLINE_SUBSTITUTIONS:
            line = pattern.sub(replacement, line)

        # Hide URL schemes so their '//' is not taken for the italic marker.
        for scheme, placeholder in _URL_GUARDS:
            line = line.replace(scheme, placeholder)

        for fmt in formats:
            marker, tag = fmt[0], fmt[1]
            # Each occurrence toggles the format: open if closed, else close.
            while marker in line:
                if fmt[2]:
                    line = line.replace(marker, '</' + tag + '>', 1)
                else:
                    line = line.replace(marker, '<' + tag + '>', 1)
                fmt[2] = not fmt[2]

        for scheme, placeholder in _URL_GUARDS:
            line = line.replace(placeholder, scheme)

        out.append(line + '\n')

    # Close whatever is still open when the input ends, so the output is
    # balanced even without a trailing blank line.
    out.append(_close_formats(formats))
    if in_pre:
        out.append('</pre>\n')
    out.append('</ol>\n' * olist_depth)
    out.append('</ul>\n' * ulist_depth)
    if in_table:
        out.append('</table>\n')

    outstring = ''.join(out)
    if USE_TOC and len(outstring) > TOC_LENGTH:
        outstring = tocmaker(contents) + outstring
    return outstring
if __name__ == '__main__':
    # Sample input for a quick manual check; uncomment the print below to
    # see the HTML that parse() produces for it.
    string = "[[test]]"
    # print(parse(string))