import os
import re
import string

from lxml import etree

# Marker that authors type in front of content that should become a fragment.
FRAGMENT_TAG = "++"
# CSS class added to every element whose text carried FRAGMENT_TAG.
FRAGMENT_CLASS = "fragment"

# Matches FRAGMENT_TAG together with any whitespace directly around it.
# Compiled once at import time instead of on every loop iteration.
_FRAGMENT_TAG_RE = re.compile(r"\s*%s\s*" % re.escape(FRAGMENT_TAG))


def _strip_tag_from_first_text_node(element):
    """Remove FRAGMENT_TAG from the first text node that is a child of *element*.

    The XPath predicates in ``defragmentize`` test ``text()``, which XPath 1.0
    converts to the string value of the element's *first* text node.  In lxml
    that node is ``element.text`` when there is leading text, otherwise it is
    the ``tail`` of the earliest child that has one
    (e.g. ``<p><b>x</b> ++ foo</p>``).
    """
    if element.text is not None:
        element.text = _FRAGMENT_TAG_RE.sub('', element.text)
        return
    for child in element:
        if child.tail is not None:
            child.tail = _FRAGMENT_TAG_RE.sub('', child.tail)
            return


def defragmentize(html):
    """Move every FRAGMENT_TAG marker in *html* into a class attribute.

    Each element whose first text node contains FRAGMENT_TAG (between two
    spaces, at the very start followed by a space, or after a tab and followed
    by a space) gets FRAGMENT_CLASS added to its ``class`` attribute, and the
    marker, together with the whitespace directly around it, is deleted from
    that text node.

    :param html: markup to process; it is parsed with ``etree.fromstring``,
        so it has to be well-formed.
    :returns: the serialized document (``method='html'``, pretty-printed,
        decoded from UTF-8) with the markers shifted to class attributes.
    """
    dom = etree.fromstring(html)
    query = "|".join([
        # FRAGMENT_TAG between two spaces
        "//*[contains(text(), ' %s ')]" % FRAGMENT_TAG,
        # FRAGMENT_TAG as first element followed by a space
        "//*[starts-with(text(), '%s ')]" % FRAGMENT_TAG,
        # FRAGMENT_TAG after a tab, followed by a space
        "//*[contains(text(), '\t%s ')]" % FRAGMENT_TAG
    ])

    for fragment in dom.xpath(query):
        class_list = fragment.get('class')
        if not class_list:
            # Missing or empty attribute: avoid producing " fragment".
            class_list = FRAGMENT_CLASS
        elif FRAGMENT_CLASS not in class_list.split():
            class_list += " %s" % FRAGMENT_CLASS
        fragment.set('class', class_list)

        # The matching text may live in a child's tail, in which case
        # fragment.text is None and must not be handed to re.sub.
        _strip_tag_from_first_text_node(fragment)

    return etree.tostring(
        dom,
        method='html',
        encoding='utf-8',
        pretty_print=True,
        doctype="",
    ).decode('utf-8')