Source code for scrachy.content.bs4

#  Copyright 2023 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.

"""
Content extraction using `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/>`__.
"""

# Python Modules
import re

# 3rd Party Modules
import bs4
from scrapy.settings import Settings

# Project Modules
from scrachy.content import BaseContentExtractor


# Tag names whose directly contained text nodes are excluded when the
# extractor collects the textual content of a document.
DOM_BLACKLIST: set[str] = {
    '[document]',
    'head',
    'header',
    'html',
    'input',
    'meta',
    'noscript',
}



[docs]
class BeautifulSoupExtractor(BaseContentExtractor):
    """
    A content extractor that strips the markup from an HTML document using
    `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_.
    """

    def __init__(self, settings: Settings):
        """
        A :class:`ContentExtractor` that uses
        `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
        to process the HTML.

        The ``SCRACHY_CONTENT_BS4_PARSER`` setting must be set to a valid
        parser name.

        :param settings: The Scrapy ``Settings``.
        """
        super().__init__(settings)

        # The value is handed unchanged to ``bs4.BeautifulSoup`` as the
        # parser argument each time ``get_content`` is called.
        self.parser_name = settings.get('SCRACHY_CONTENT_BS4_PARSER')

    def get_content(self, html: str) -> str:
        """
        Extracts the textual content from the html using a simple algorithm
        described
        `here <https://matix.io/extract-text-from-webpage-using-beautifulsoup-and-python/>`_.
        In short, it ignores blocks that are unlikely to contain meaningful
        content, e.g., script blocks, and then strips the tags from the remaining
        document.

        Each remaining text node becomes one line of the output. Leading and
        trailing whitespace is removed, internal runs of whitespace are
        collapsed to a single space and nodes that end up empty are dropped.

        :param html: The html content as text.
        :return: The extracted text, one text node per line.
        """
        dom = bs4.BeautifulSoup(html, self.parser_name)

        # Remove script and style nodes from the DOM
        for node in dom(['script', 'style']):
            node.extract()

        # Find the remaining text nodes. ``string`` is the current name of
        # the argument that used to be called ``text``; the old spelling is
        # deprecated in Beautiful Soup.
        text_nodes = dom.find_all(string=True)

        # Lazily strip and whitespace-normalize the text of every node whose
        # parent tag is not blacklisted.
        lines = (
            re.sub(r'\s+', ' ', t.strip())
            for t in text_nodes
            if t.parent.name not in DOM_BLACKLIST
        )

        # Skip the nodes that contained nothing but whitespace
        return '\n'.join(line for line in lines if line)