languagemodels.preprocess

 1from html import unescape
 2from html.parser import HTMLParser
 3
 4
 5def get_html_paragraphs(src: str):
 6    """
 7    Return plain text paragraphs from an HTML source
 8
 9    :param src: HTML document to convert to plain text paragraphs
10    :return: Plain text paragraphs of document
11
12    This function is designed to be quick rather than robust.
13
14    It follows a simple approach to extracting text:
15
16    1. Ignore all content inside the following elements listed in `ignore`.
17    2. Merge inline text content into paragraphs from `inlines` set.
18    3. Convert any newly merged text element with at least `min_length`
19    characters to a paragraph in the output text.
20
21    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
22    'Bolu Province (Turkish: Bolu ili) is a province...'
23
24    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
25    "First, the good news. Netflix reported a record ..."
26    """
27
28    class ParagraphExtractor(HTMLParser):
29        paras = [""]
30        ignoring = []
31        ignore = ("script", "style", "header", "footer")
32        ignore_attrs = {('hidden', 'hidden'), }
33        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
34
35        def handle_starttag(self, tag, attrs):
36            if tag in self.ignore or self.ignore_attrs & set(attrs):
37                self.ignoring.append(tag)
38
39            if tag not in self.inlines and self.paras[-1]:
40                self.paras.append("")
41
42        def handle_endtag(self, tag):
43            if self.ignoring and self.ignoring[-1] == tag:
44                self.ignoring.pop()
45
46            if tag not in self.inlines and self.paras[-1]:
47                self.paras.append("")
48
49        def handle_data(self, data):
50            if not self.ignoring:
51                if self.paras and self.paras[-1]:
52                    self.paras[-1] += unescape(data)
53                else:
54                    self.paras.append(data)
55
56        def get_plain(self):
57            return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140])
58
59    extractor = ParagraphExtractor()
60    extractor.feed(src)
61    return extractor.get_plain()
def get_html_paragraphs(src: str):
 6def get_html_paragraphs(src: str):
 7    """
 8    Return plain text paragraphs from an HTML source
 9
10    :param src: HTML document to convert to plain text paragraphs
11    :return: Plain text paragraphs of document
12
13    This function is designed to be quick rather than robust.
14
15    It follows a simple approach to extracting text:
16
17    1. Ignore all content inside the following elements listed in `ignore`.
18    2. Merge inline text content into paragraphs from `inlines` set.
19    3. Convert any newly merged text element with at least `min_length`
20    characters to a paragraph in the output text.
21
22    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
23    'Bolu Province (Turkish: Bolu ili) is a province...'
24
25    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
26    "First, the good news. Netflix reported a record ..."
27    """
28
29    class ParagraphExtractor(HTMLParser):
30        paras = [""]
31        ignoring = []
32        ignore = ("script", "style", "header", "footer")
33        ignore_attrs = {('hidden', 'hidden'), }
34        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
35
36        def handle_starttag(self, tag, attrs):
37            if tag in self.ignore or self.ignore_attrs & set(attrs):
38                self.ignoring.append(tag)
39
40            if tag not in self.inlines and self.paras[-1]:
41                self.paras.append("")
42
43        def handle_endtag(self, tag):
44            if self.ignoring and self.ignoring[-1] == tag:
45                self.ignoring.pop()
46
47            if tag not in self.inlines and self.paras[-1]:
48                self.paras.append("")
49
50        def handle_data(self, data):
51            if not self.ignoring:
52                if self.paras and self.paras[-1]:
53                    self.paras[-1] += unescape(data)
54                else:
55                    self.paras.append(data)
56
57        def get_plain(self):
58            return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140])
59
60    extractor = ParagraphExtractor()
61    extractor.feed(src)
62    return extractor.get_plain()

Return plain text paragraphs from an HTML source

Parameters
  • src: HTML document to convert to plain text paragraphs
Returns

Plain text paragraphs of document

This function is designed to be quick rather than robust.

It follows a simple approach to extracting text:

  1. Ignore all content inside the elements listed in ignore.
  2. Merge inline text content (text inside the elements listed in the inlines set) into paragraphs.
  3. Convert any merged text element longer than 140 characters (the hard-coded minimum length, measured without surrounding whitespace) to a paragraph in the output text.
>>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
'Bolu Province (Turkish: Bolu ili) is a province...'
>>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
"First, the good news. Netflix reported a record ..."