Add more specific fragment tag identification
This commit is contained in:
parent
9b05df056f
commit
4ce0b0dcd6
17
fragments.py
17
fragments.py
@ -10,9 +10,19 @@ FRAGMENT_CLASS = "fragment"
|
||||
# Search and delete the FRAGMENT_TAG anywhere in the given HTML
|
||||
# while adding FRAGMENT_CLASS to the class attribute of each element that contained it
|
||||
# @returns the HTML string with each matched FRAGMENT_TAG removed from the element text and FRAGMENT_CLASS added to that element's class attribute
|
||||
|
||||
|
||||
def defragmentize(html):
|
||||
dom = etree.fromstring(html)
|
||||
fragments = dom.xpath("//*[contains(text(), '%s')]" % FRAGMENT_TAG)
|
||||
query = "|".join([
|
||||
# FRAGMENT_TAG between two spaces
|
||||
"//*[contains(text(), ' %s ')]" % FRAGMENT_TAG,
|
||||
# FRAGMENT_TAG as first element followed by a space
|
||||
"//*[starts-with(text(), '%s ')]" % FRAGMENT_TAG,
|
||||
# FRAGMENT_TAG after a tab, followed by a space
|
||||
"//*[contains(text(), '\t%s ')]" % FRAGMENT_TAG
|
||||
])
|
||||
fragments = dom.xpath(query)
|
||||
|
||||
for fragment in fragments:
|
||||
class_list = fragment.get('class')
|
||||
@ -22,7 +32,8 @@ def defragmentize(html):
|
||||
class_list += " %s" % FRAGMENT_CLASS
|
||||
|
||||
fragment.set('class', class_list)
|
||||
fragment.text = re.sub(r"\s*%s\s*" % re.escape(FRAGMENT_TAG), '', fragment.text)
|
||||
fragment.text = re.sub(r"\s*%s\s*" %
|
||||
re.escape(FRAGMENT_TAG), '', fragment.text)
|
||||
|
||||
return etree.tostring(dom, method='html', encoding='utf-8',
|
||||
pretty_print=True).decode('utf-8')
|
||||
pretty_print=True).decode('utf-8')
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user