29 lines
834 B
Python
29 lines
834 B
Python
import os
|
|
import re
|
|
import string
|
|
|
|
from lxml import etree
|
|
|
|
# Literal marker that defragmentize() searches for in element text and removes.
FRAGMENT_TAG = "++"
|
|
# Value defragmentize() adds to the `class` attribute of each marked element.
FRAGMENT_CLASS = "fragment"
|
|
|
|
# Search and delete the FRAGMENT_TAG anywhere in the given HTML
|
|
# while adding the FRAGMENT_CLASS as an attribute
|
|
# @returns the HTML with all FRAGMENT_TAG instances shifted to an attribute
|
|
def defragmentize(html):
    """Replace FRAGMENT_TAG markers in element text with a class attribute.

    The markup is parsed with ``etree.fromstring``. Every element whose first
    text node contains FRAGMENT_TAG gets FRAGMENT_CLASS added to its ``class``
    attribute (created if absent, not duplicated if already present), and the
    marker, together with the whitespace around it, is removed from the
    element's own text nodes.

    Args:
        html: Markup to process; it must be parseable by ``etree.fromstring``.

    Returns:
        The modified document serialized with ``method='html'`` and
        ``pretty_print=True``, decoded from UTF-8 into a string.

    Raises:
        Whatever ``etree.fromstring`` raises for markup it cannot parse; the
        error is not caught here.
    """
    dom = etree.fromstring(html)

    # Hand the marker over as an XPath variable instead of splicing it into
    # the expression, so a marker containing quotes cannot break the query.
    fragments = dom.xpath("//*[contains(text(), $tag)]", tag=FRAGMENT_TAG)

    # Loop-invariant: the marker plus any whitespace hugging it.
    tag_pattern = re.compile(r"\s*%s\s*" % re.escape(FRAGMENT_TAG))

    for fragment in fragments:
        class_list = fragment.get('class')
        if not class_list or not class_list.strip():
            # Missing or blank attribute: avoid producing " fragment".
            class_list = FRAGMENT_CLASS
        elif FRAGMENT_CLASS not in class_list.split():
            class_list += " %s" % FRAGMENT_CLASS
        fragment.set('class', class_list)

        # contains(text(), ...) tests the element's first text node. When the
        # element starts with a child element, that node is the child's tail
        # and fragment.text is None, so it must not be passed to the regex.
        if fragment.text is not None:
            fragment.text = tag_pattern.sub('', fragment.text)

        # The element's remaining text nodes are stored as the tails of its
        # children; strip the marker from those as well.
        for child in fragment:
            if child.tail is not None:
                child.tail = tag_pattern.sub('', child.tail)

    return etree.tostring(dom, method='html', encoding='utf-8',
                          pretty_print=True).decode('utf-8')
|