languagemodels.preprocess

View Source

 1from html import unescape
 2from html.parser import HTMLParser
 3
 4
 5def get_html_paragraphs(src: str):
 6    """
 7    Return plain text paragraphs from an HTML source
 8
 9    :param src: HTML document to convert to plain text paragraphs
10    :return: Plain text paragraphs of document
11
12    This function is designed to be quick rather than robust.
13
14    It follows a simple approach to extracting text:
15
16    1. Ignore all content inside the following elements listed in `ignore`.
17    2. Merge inline text content into paragraphs from `inlines` set.
18    3. Convert any newly merged text element with at least `min_length`
19    characters to a paragraph in the output text.
20
21    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
22    'Bolu Province (Turkish: Bolu ili) is a province...'
23
24    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
25    "First, the good news. Netflix reported a record ..."
26    """
27
28    class ParagraphExtractor(HTMLParser):
29        paras = [""]
30        ignoring = []
31        ignore = ("script", "style", "header", "footer")
32        ignore_attrs = {('hidden', 'hidden'), }
33        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
34
35        def handle_starttag(self, tag, attrs):
36            if tag in self.ignore or self.ignore_attrs & set(attrs):
37                self.ignoring.append(tag)
38
39            if tag not in self.inlines and self.paras[-1]:
40                self.paras.append("")
41
42        def handle_endtag(self, tag):
43            if self.ignoring and self.ignoring[-1] == tag:
44                self.ignoring.pop()
45
46            if tag not in self.inlines and self.paras[-1]:
47                self.paras.append("")
48
49        def handle_data(self, data):
50            if not self.ignoring:
51                if self.paras and self.paras[-1]:
52                    self.paras[-1] += unescape(data)
53                else:
54                    self.paras.append(data)
55
56        def get_plain(self):
57            return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140])
58
59    extractor = ParagraphExtractor()
60    extractor.feed(src)
61    return extractor.get_plain()

def get_html_paragraphs(src: str): View Source

 6def get_html_paragraphs(src: str):
 7    """
 8    Return plain text paragraphs from an HTML source
 9
10    :param src: HTML document to convert to plain text paragraphs
11    :return: Plain text paragraphs of document
12
13    This function is designed to be quick rather than robust.
14
15    It follows a simple approach to extracting text:
16
17    1. Ignore all content inside the following elements listed in `ignore`.
18    2. Merge inline text content into paragraphs from `inlines` set.
19    3. Convert any newly merged text element with at least `min_length`
20    characters to a paragraph in the output text.
21
22    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
23    'Bolu Province (Turkish: Bolu ili) is a province...'
24
25    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
26    "First, the good news. Netflix reported a record ..."
27    """
28
29    class ParagraphExtractor(HTMLParser):
30        paras = [""]
31        ignoring = []
32        ignore = ("script", "style", "header", "footer")
33        ignore_attrs = {('hidden', 'hidden'), }
34        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
35
36        def handle_starttag(self, tag, attrs):
37            if tag in self.ignore or self.ignore_attrs & set(attrs):
38                self.ignoring.append(tag)
39
40            if tag not in self.inlines and self.paras[-1]:
41                self.paras.append("")
42
43        def handle_endtag(self, tag):
44            if self.ignoring and self.ignoring[-1] == tag:
45                self.ignoring.pop()
46
47            if tag not in self.inlines and self.paras[-1]:
48                self.paras.append("")
49
50        def handle_data(self, data):
51            if not self.ignoring:
52                if self.paras and self.paras[-1]:
53                    self.paras[-1] += unescape(data)
54                else:
55                    self.paras.append(data)
56
57        def get_plain(self):
58            return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140])
59
60    extractor = ParagraphExtractor()
61    extractor.feed(src)
62    return extractor.get_plain()

Return plain text paragraphs from an HTML source

Parameters

src: HTML document to convert to plain text paragraphs

Returns

Plain text paragraphs of document

This function is designed to be quick rather than robust.

It follows a simple approach to extracting text:

1. Ignore all content inside the elements listed in `ignore`.
2. Merge text inside the inline elements listed in `inlines` into the surrounding paragraph.
3. Convert any merged text element whose stripped text is longer than 140 characters to a paragraph in the output text.

>>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
'Bolu Province (Turkish: Bolu ili) is a province...'

>>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
"First, the good news. Netflix reported a record ..."