add utils to do more cleanups in html

tpeng · tpeng · commit 84e5d46b7cc0 · 2013-11-01T14:57:00.000+08:00
e.g. replace header tags with a more general tag.
 The idea is to make webstruct work on "generalized" HTML.
diff --git a/webstruct/feature_extraction.py b/webstruct/feature_extraction.py
@@ -6,6 +6,7 @@
 from sklearn.base import BaseEstimator
 from .preprocess import IobSequence, Tagset, to_features_and_labels
 from .tokenize import default_tokenizer
+from .htmls import replace_tags, kill_tags
 
 _cleaner = lxml.html.clean.Cleaner(
     style=True,
@@ -80,7 +81,10 @@ def clean_html(cls, html, encoding=None):
         return _cleaner.clean_html(html)
 
     def _parse_html(self, html, encoding=None):
-        return self.clean_html(html, encoding)
+        doc = self.clean_html(html, encoding)
+        doc = replace_tags(doc, {'h3', 'h4', 'b'}, 'strong')  # generalize: h3/h4/b are all renamed to <strong>
+        doc = kill_tags(doc, {'br'}, keep_child=False)  # keep_child=False -> <br> elements are removed via drop_tree()
+        return doc
 
     def fit_transform(self, X, y=None, encoding=None):
         """
diff --git a/webstruct/htmls.py b/webstruct/htmls.py
@@ -0,0 +1,45 @@
+from lxml.etree import iterwalk
+
+
+def replace_tags(root, tags, name):
+    """
+    Rename every element in the tree at ``root`` whose tag is in ``tags`` to ``name``; returns ``root``.
+
+    >>> from lxml.html import fragment_fromstring, document_fromstring, tostring
+    >>> root = fragment_fromstring('<h1>head 1</h1>')
+    >>> root = replace_tags(root, {'h1'}, 'strong')
+    >>> tostring(root)
+    '<strong>head 1</strong>'
+
+    >>> root = document_fromstring('<h1>head 1</h1> <h2>head 2</h2>')
+    >>> root = replace_tags(root, {'h1','h2','h3','h4'}, 'strong')
+    >>> tostring(root)
+    '<html><body><strong>head 1</strong> <strong>head 2</strong></body></html>'
+    """
+    for tag in tags:  # one pass over the tree per tag name
+        for e in root.iter(tag):  # as the first doctest shows, root itself is matched too
+            e.tag = name  # renamed in place; the same tree object is handed back
+    return root
+
+def kill_tags(root, tags, keep_child=True):
+    """Remove elements whose tag is in ``tags``, keeping their content if ``keep_child``; returns ``root``.
+    >>> from lxml.html import fragment_fromstring, tostring
+    >>> root = fragment_fromstring('<div><h1>head 1</h1></div>')
+    >>> root = kill_tags(root, {'h1'})
+    >>> tostring(root)
+    '<div>head 1</div>'
+
+    >>> root = fragment_fromstring('<div><h1>head 1</h1></div>')
+    >>> root = kill_tags(root, {'h1'}, False)
+    >>> tostring(root)
+    '<div></div>'
+    """
+    for _, elem in iterwalk(root):  # NOTE(review): tree is mutated while being walked - confirm no siblings get skipped
+        if elem.tag in tags:
+            if keep_child:
+                elem.drop_tag()  # only the tag goes; its content stays in the parent (first doctest)
+            else:
+                elem.drop_tree()  # the element goes together with its content (second doctest)
+    return root
+
+