onpoint/fragments.py

30 lines
878 B
Python

import os
import re
import string
from lxml import etree
# Marker configuration used by defragmentize(): the in-text marker is a caret
# followed by the prefix, and FRAGMENT_CLASS is the class name added to
# elements whose text carries that marker.
FRAGMENT_PREFIX = "in"
FRAGMENT_CLASS = "fragment"
FRAGMENT_TAG = "^" + FRAGMENT_PREFIX
def defragmentize(html):
    """Move every FRAGMENT_TAG marker from element text into a class attribute.

    Each element that has the marker in one of its own text nodes gets
    FRAGMENT_CLASS appended to its ``class`` attribute (or set, when the
    attribute is absent), and the marker is removed from the element's text.

    html -- markup to process; it is parsed with ``etree.fromstring`` (the
            XML parser), so it has to be well-formed with a single root.

    @returns the pretty-printed HTML serialization of the document, as a
             string, with all FRAGMENT_TAG instances shifted to an attribute
    """
    dom = etree.fromstring(html)

    # Compile once; escaping the tag keeps the pattern literal even if the
    # prefix is ever changed to something containing regex metacharacters.
    # The surrounding \W* also swallows whitespace/punctuation next to the tag.
    tag_pattern = re.compile(r"\W*%s\W*" % re.escape(FRAGMENT_TAG))

    # Test each direct text node on its own. `contains(text(), ...)` would
    # only inspect the first text node of an element, which misses markers
    # that appear after an inline child.
    fragments = dom.xpath("//*[text()[contains(., '%s')]]" % FRAGMENT_TAG)
    for fragment in fragments:
        class_list = fragment.get('class')
        if class_list is None:
            class_list = FRAGMENT_CLASS
        else:
            class_list += " %s" % FRAGMENT_CLASS
        fragment.set('class', class_list)

        # Leading text may be None when the element starts with a child
        # element; passing None to the regex would raise TypeError.
        if fragment.text:
            fragment.text = tag_pattern.sub('', fragment.text).strip()

        # Text that follows a child element is stored on that child's tail,
        # so the marker has to be removed there as well.
        for child in fragment:
            if child.tail and FRAGMENT_TAG in child.tail:
                child.tail = tag_pattern.sub('', child.tail)

    return etree.tostring(dom, method='html', encoding='utf-8',
                          pretty_print=True).decode('utf-8')