# onpoint/fragments.py

import os
import re
import string

from lxml import etree

# Marker searched for in element text; defragmentize() removes it (together
# with any whitespace around it) from the elements it matches.
FRAGMENT_TAG = "++"
# Class name that defragmentize() adds to the `class` attribute of every
# element whose text carried FRAGMENT_TAG.
FRAGMENT_CLASS = "fragment"

def defragmentize(html):
  """Move every FRAGMENT_TAG marker out of element text and into a class.

  Each element that has FRAGMENT_TAG in one of its own (direct) text nodes
  gets FRAGMENT_CLASS added to its `class` attribute, and the marker --
  together with any whitespace surrounding it -- is removed from that text.

  @param html  markup to process; it is parsed with etree.fromstring, i.e.
               lxml's XML parser, so it has to be well-formed
  @returns the document serialized as pretty-printed HTML, decoded from
           UTF-8, with all FRAGMENT_TAG instances shifted to an attribute
  @raises etree.XMLSyntaxError if `html` cannot be parsed
  """
  dom = etree.fromstring(html)

  # Test every direct text node of an element. The previous expression,
  # contains(text(), ...), only looked at the first text node, so a marker
  # placed after a child element was either missed or matched while
  # `.text` was None. The marker is passed as an XPath variable so that it
  # never needs quoting inside the expression.
  fragments = dom.xpath("//*[text()[contains(., $marker)]]",
    marker=FRAGMENT_TAG)

  # Loop-invariant: the marker plus any whitespace around it.
  marker_pattern = re.compile(r"\s*%s\s*" % re.escape(FRAGMENT_TAG))

  for fragment in fragments:
    class_list = fragment.get('class')
    if not class_list:
      # Missing or empty attribute: no leading space wanted.
      class_list = FRAGMENT_CLASS
    elif FRAGMENT_CLASS not in class_list.split():
      class_list = "%s %s" % (class_list, FRAGMENT_CLASS)
    # else: class already present, leave the attribute untouched.
    fragment.set('class', class_list)

    # lxml stores an element's direct text in two places: `.text` (before
    # the first child) and each child's `.tail` (after that child). Both
    # can be None, and both may carry the marker.
    if fragment.text is not None:
      fragment.text = marker_pattern.sub('', fragment.text)
    for child in fragment:
      if child.tail is not None:
        child.tail = marker_pattern.sub('', child.tail)

  return etree.tostring(dom, method='html', encoding='utf-8',
    pretty_print=True).decode('utf-8')