Source code for scrachy.content.bs4
# Copyright 2023 Reid Swanson.
#
# This file is part of scrachy.
#
# scrachy is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scrachy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with scrachy. If not, see <https://www.gnu.org/licenses/>.
"""
Content extraction using `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/>`__.
"""
# Python Modules
import re
# 3rd Party Modules
import bs4
from scrapy.settings import Settings
# Project Modules
from scrachy.content import BaseContentExtractor
# Names of parent elements whose text nodes are dropped by
# ``BeautifulSoupExtractor.get_content``. A text node is skipped when its
# ``parent.name`` is a member of this set.
DOM_BLACKLIST: set[str] = {
    '[document]',
    'head',
    'header',
    'html',
    'input',
    'meta',
    'noscript',
}
[docs]
class BeautifulSoupExtractor(BaseContentExtractor):
    """
    Extracts the textual content of an HTML document using
    `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_.
    """

    def __init__(self, settings: Settings):
        """
        A :class:`ContentExtractor` that uses
        `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
        to process the HTML.

        The ``SCRACHY_CONTENT_BS4_PARSER`` setting must be set to a valid
        parser name.

        :param settings: The Scrapy ``Settings``.
        """
        super().__init__(settings)

        # Name of the parser handed to ``bs4.BeautifulSoup`` on every call
        # to ``get_content``. It is read once here and not validated.
        self.parser_name = settings.get('SCRACHY_CONTENT_BS4_PARSER')

    def get_content(self, html: str) -> str:
        """
        Extracts the textual content from the html using a simple algorithm
        described
        `here <https://matix.io/extract-text-from-webpage-using-beautifulsoup-and-python/>`_.

        In short, it ignores blocks that are unlikely to contain meaningful
        content, e.g., script blocks, and then strips the tags from the remaining
        document. Runs of whitespace inside each text node are collapsed to a
        single space and the non-empty nodes are joined with newlines.

        :param html: The html content as text.
        :return: Return the extracted text.
        """
        dom = bs4.BeautifulSoup(html, self.parser_name)

        # Remove script and style nodes from the DOM
        for node in dom(['script', 'style']):
            node.extract()

        # Find the remaining text nodes. ``string=`` is the current name of
        # the argument formerly called ``text=``, which Beautiful Soup has
        # deprecated.
        # NOTE(review): ``string=True`` matches every ``NavigableString``,
        # which presumably includes HTML comments; confirm whether comment
        # text is meant to be part of the extracted content.
        text_nodes = dom.find_all(string=True)

        lines = []
        for text_node in text_nodes:
            # Only include the text from the nodes that aren't blacklisted
            if text_node.parent.name in DOM_BLACKLIST:
                continue

            # Trim the node and normalize internal runs of whitespace
            normalized = re.sub(r'\s+', ' ', text_node.strip())

            # Skip nodes that contained nothing but whitespace
            if normalized:
                lines.append(normalized)

        return '\n'.join(lines)