languagemodels.preprocess
from html import unescape  # not used by get_html_paragraphs; import line kept unchanged
from html.parser import HTMLParser


def get_html_paragraphs(src: str):
    """
    Return plain text paragraphs from an HTML source

    :param src: HTML document to convert to plain text paragraphs
    :return: Plain text paragraphs of document, separated by blank lines

    This function is designed to be quick rather than robust.

    It follows a simple approach to extracting text:

    1. Ignore all content inside the elements listed in `ignore` and inside
       elements carrying an attribute pair listed in `ignore_attrs`.
    2. Merge inline text content into paragraphs from `inlines` set.
    3. Convert any newly merged text element with more than `min_length`
       characters to a paragraph in the output text.

    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
    'Bolu Province (Turkish: Bolu ili) is a province...'

    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
    "First, the good news. Netflix reported a record ..."
    """

    class ParagraphExtractor(HTMLParser):
        ignore = ("script", "style", "header", "footer")
        ignore_attrs = {("hidden", "hidden")}
        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
        # Void elements never receive an end tag, so they must not be pushed
        # onto the ignore stack (nothing would ever pop them again).
        void = frozenset(
            (
                "area", "base", "br", "col", "embed", "hr", "img",
                "input", "link", "meta", "source", "track", "wbr",
            )
        )
        # A merged text run must be longer than this to count as a paragraph.
        min_length = 140

        def __init__(self):
            super().__init__()
            # Per-instance state; mutable class attributes would be shared.
            self.paras = [""]
            self.ignoring = []

        def handle_starttag(self, tag, attrs):
            starts_ignored = tag in self.ignore or bool(
                self.ignore_attrs & set(attrs)
            )
            # A nested tag with the same name as the innermost ignored element
            # is pushed too, so its end tag does not end the ignored region.
            nested_same = bool(self.ignoring) and self.ignoring[-1] == tag

            if (starts_ignored or nested_same) and tag not in self.void:
                self.ignoring.append(tag)

            # Any non-inline tag terminates the paragraph being collected.
            if tag not in self.inlines and self.paras[-1]:
                self.paras.append("")

        def handle_endtag(self, tag):
            if self.ignoring and self.ignoring[-1] == tag:
                self.ignoring.pop()

            if tag not in self.inlines and self.paras[-1]:
                self.paras.append("")

        def handle_data(self, data):
            if self.ignoring:
                return
            # HTMLParser (convert_charrefs=True) has already decoded character
            # references, so the text is appended as-is; unescaping it again
            # would corrupt literal text such as "&amp;lt;".
            self.paras[-1] += data

        def get_plain(self):
            return "\n\n".join(
                p.rstrip()
                for p in self.paras
                if len(p.strip()) > self.min_length
            )

    extractor = ParagraphExtractor()
    extractor.feed(src)
    # Flush any text still buffered by the parser (e.g. a trailing "&...").
    extractor.close()
    return extractor.get_plain()
def get_html_paragraphs(src: str):
def get_html_paragraphs(src: str):
    """
    Return plain text paragraphs from an HTML source

    :param src: HTML document to convert to plain text paragraphs
    :return: Plain text paragraphs of document, separated by blank lines

    This function is designed to be quick rather than robust.

    It follows a simple approach to extracting text:

    1. Ignore all content inside the elements listed in `ignore` and inside
       elements carrying an attribute pair listed in `ignore_attrs`.
    2. Merge inline text content into paragraphs from `inlines` set.
    3. Convert any newly merged text element with more than `min_length`
       characters to a paragraph in the output text.

    >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
    'Bolu Province (Turkish: Bolu ili) is a province...'

    >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
    "First, the good news. Netflix reported a record ..."
    """

    class ParagraphExtractor(HTMLParser):
        ignore = ("script", "style", "header", "footer")
        ignore_attrs = {("hidden", "hidden")}
        inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em")
        # Void elements never receive an end tag, so they must not be pushed
        # onto the ignore stack (nothing would ever pop them again).
        void = frozenset(
            (
                "area", "base", "br", "col", "embed", "hr", "img",
                "input", "link", "meta", "source", "track", "wbr",
            )
        )
        # A merged text run must be longer than this to count as a paragraph.
        min_length = 140

        def __init__(self):
            super().__init__()
            # Per-instance state; mutable class attributes would be shared.
            self.paras = [""]
            self.ignoring = []

        def handle_starttag(self, tag, attrs):
            starts_ignored = tag in self.ignore or bool(
                self.ignore_attrs & set(attrs)
            )
            # A nested tag with the same name as the innermost ignored element
            # is pushed too, so its end tag does not end the ignored region.
            nested_same = bool(self.ignoring) and self.ignoring[-1] == tag

            if (starts_ignored or nested_same) and tag not in self.void:
                self.ignoring.append(tag)

            # Any non-inline tag terminates the paragraph being collected.
            if tag not in self.inlines and self.paras[-1]:
                self.paras.append("")

        def handle_endtag(self, tag):
            if self.ignoring and self.ignoring[-1] == tag:
                self.ignoring.pop()

            if tag not in self.inlines and self.paras[-1]:
                self.paras.append("")

        def handle_data(self, data):
            if self.ignoring:
                return
            # HTMLParser (convert_charrefs=True) has already decoded character
            # references, so the text is appended as-is; unescaping it again
            # would corrupt literal text such as "&amp;lt;".
            self.paras[-1] += data

        def get_plain(self):
            return "\n\n".join(
                p.rstrip()
                for p in self.paras
                if len(p.strip()) > self.min_length
            )

    extractor = ParagraphExtractor()
    extractor.feed(src)
    # Flush any text still buffered by the parser (e.g. a trailing "&...").
    extractor.close()
    return extractor.get_plain()
Return plain text paragraphs from an HTML source
Parameters
- src: HTML document to convert to plain text paragraphs
Returns
Plain text paragraphs of document
This function is designed to be quick rather than robust.
It follows a simple approach to extracting text:
1. Ignore all content inside the elements listed in `ignore`.
2. Merge inline text content into paragraphs from the `inlines` set.
3. Convert any newly merged text element with more than `min_length` (140) characters to a paragraph in the output text.
>>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read())
'Bolu Province (Turkish: Bolu ili) is a province...'
>>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read())
"First, the good news. Netflix reported a record ..."