import os
import re
import string

from lxml import etree

FRAGMENT_TAG = "++"
FRAGMENT_CLASS = "fragment"

# The marker plus any whitespace around it, so removing the marker does not
# leave a stray gap behind. Compiled once instead of on every element.
_FRAGMENT_TAG_RE = re.compile(r"\s*%s\s*" % re.escape(FRAGMENT_TAG))


def _strip_fragment_tag(text):
    """Return *text* with every FRAGMENT_TAG removed and the result stripped.

    ``None`` (lxml's value for "no text here") and text that does not contain
    the marker are returned unchanged, so untouched nodes keep their spacing.
    """
    if text is None or FRAGMENT_TAG not in text:
        return text
    return _FRAGMENT_TAG_RE.sub('', text).strip()


def defragmentize(html):
    """Move every FRAGMENT_TAG marker from element text into a class attribute.

    Each element that has the marker in one of its own text nodes gets
    FRAGMENT_CLASS added to its ``class`` attribute (the attribute is created
    when absent, and the class is not added twice), and the marker is removed
    from that element's text.

    Args:
        html: Markup to process. It is parsed with ``etree.fromstring``, i.e.
            by the XML parser, so it has to be well-formed.

    Returns:
        The document serialized with ``method='html'``, pretty-printed, as a
        ``str``.
    """
    dom = etree.fromstring(html)

    # `contains(text(), ...)` would only inspect the FIRST text node of each
    # element (XPath 1.0 node-set -> string conversion), so the predicate is
    # applied to every text node instead. The marker is passed as an XPath
    # variable rather than interpolated into the query string.
    fragments = dom.xpath("//*[text()[contains(., $tag)]]", tag=FRAGMENT_TAG)

    for fragment in fragments:
        class_list = fragment.get('class')
        if class_list is None:
            class_list = FRAGMENT_CLASS
        elif FRAGMENT_CLASS not in class_list.split():
            class_list += " %s" % FRAGMENT_CLASS
        fragment.set('class', class_list)

        # An element's own text nodes are its leading `.text` plus the
        # `.tail` of each direct child; the marker may sit in any of them,
        # and `.text` is None when the element starts with a child element.
        fragment.text = _strip_fragment_tag(fragment.text)
        for child in fragment:
            child.tail = _strip_fragment_tag(child.tail)

    return etree.tostring(
        dom, method='html', encoding='utf-8', pretty_print=True
    ).decode('utf-8')