Skip to content

Commit 84e5d46

Browse files
committed
add utils to do more cleanups in html
e.g. replace header tags with a more general tag. The idea is to make webstruct work on a "generalized" html.
1 parent 44e1e3c commit 84e5d46

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

webstruct/feature_extraction.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from sklearn.base import BaseEstimator
77
from .preprocess import IobSequence, Tagset, to_features_and_labels
88
from .tokenize import default_tokenizer
9+
from .htmls import replace_tags, kill_tags
910

1011
_cleaner = lxml.html.clean.Cleaner(
1112
style=True,
@@ -80,7 +81,10 @@ def clean_html(cls, html, encoding=None):
8081
return _cleaner.clean_html(html)
8182

8283
def _parse_html(self, html, encoding=None):
    """
    Clean ``html`` and generalize its markup.

    The input is first passed through ``self.clean_html``. Afterwards
    ``h3``, ``h4`` and ``b`` elements are renamed to ``strong`` and all
    ``br`` elements are removed with ``keep_child=False``.
    The resulting tree is returned.
    """
    cleaned = self.clean_html(html, encoding)
    # Collapse heading/bold variants into a single, more general tag.
    generalized = replace_tags(cleaned, {'h3', 'h4', 'b'}, 'strong')
    # NOTE(review): removing <br> may join the text on both sides of it
    # without any separator -- confirm this is fine for the tokenizer.
    return kill_tags(generalized, {'br'}, keep_child=False)
8488

8589
def fit_transform(self, X, y=None, encoding=None):
8690
"""

webstruct/htmls.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from lxml.etree import iterwalk
2+
3+
4+
def replace_tags(root, tags, name):
    """
    Rename elements whose tag is listed in ``tags`` to ``name``.

    ``root`` itself and all of its descendants are considered. The tree
    is modified in place and ``root`` is returned for convenience.

    >>> from lxml.html import fragment_fromstring, document_fromstring, tostring
    >>> root = fragment_fromstring('<h1>head 1</h1>')
    >>> root = replace_tags(root, {'h1'}, 'strong')
    >>> tostring(root)
    '<strong>head 1</strong>'

    >>> root = document_fromstring('<h1>head 1</h1> <h2>head 2</h2>')
    >>> root = replace_tags(root, {'h1','h2','h3','h4'}, 'strong')
    >>> tostring(root)
    '<html><body><strong>head 1</strong> <strong>head 2</strong></body></html>'
    """
    # Lazily walk the tree once per requested tag, in the order given.
    matching = (node for wanted in tags for node in root.iter(wanted))
    for node in matching:
        node.tag = name
    return root
23+
24+
def kill_tags(root, tags, keep_child=True):
    """
    Drop every element whose tag is listed in ``tags``.

    With ``keep_child=True`` (the default) ``drop_tag()`` is called on
    each match, so its content stays in the tree; otherwise
    ``drop_tree()`` is called and the content goes with it.
    ``root`` is returned.

    NOTE(review): a matching element without a parent (e.g. ``root``
    itself) presumably cannot be dropped -- confirm against the lxml
    ``drop_tag`` / ``drop_tree`` documentation.

    >>> from lxml.html import fragment_fromstring, tostring
    >>> root = fragment_fromstring('<div><h1>head 1</h1></div>')
    >>> root = kill_tags(root, {'h1'})
    >>> tostring(root)
    '<div>head 1</div>'

    >>> root = fragment_fromstring('<div><h1>head 1</h1></div>')
    >>> root = kill_tags(root, {'h1'}, False)
    >>> tostring(root)
    '<div></div>'
    """
    for _event, node in iterwalk(root):
        if node.tag not in tags:
            continue
        # Pick the removal flavour for this match and apply it.
        remove = node.drop_tag if keep_child else node.drop_tree
        remove()
    return root
44+
45+

0 commit comments

Comments
 (0)