diff --git a/fragments.py b/fragments.py index cf3ffcb..f9b49e8 100644 --- a/fragments.py +++ b/fragments.py @@ -10,9 +10,19 @@ FRAGMENT_CLASS = "fragment" # Search and delete the FRAGMENT_TAG anywhere in the given HTML # while adding the FRAGMENT_CLASS as an attribute # @returns the HTML with all FRAGMENT_TAG instances shifted to an attribute + + def defragmentize(html): dom = etree.fromstring(html) - fragments = dom.xpath("//*[contains(text(), '%s')]" % FRAGMENT_TAG) + query = "|".join([ + # FRAGMENT_TAG between two spaces + "//*[contains(text(), ' %s ')]" % FRAGMENT_TAG, + # FRAGMENT_TAG as first element followed by a space + "//*[starts-with(text(), '%s ')]" % FRAGMENT_TAG, + # FRAGMENT_TAG after a tab, followed by a space + "//*[contains(text(), '\t%s ')]" % FRAGMENT_TAG + ]) + fragments = dom.xpath(query) for fragment in fragments: class_list = fragment.get('class') @@ -22,7 +32,8 @@ def defragmentize(html): class_list += " %s" % FRAGMENT_CLASS fragment.set('class', class_list) - fragment.text = re.sub(r"\s*%s\s*" % re.escape(FRAGMENT_TAG), '', fragment.text) + fragment.text = re.sub(r"\s*%s\s*" % + re.escape(FRAGMENT_TAG), '', fragment.text) return etree.tostring(dom, method='html', encoding='utf-8', - pretty_print=True).decode('utf-8') + pretty_print=True).decode('utf-8')