# Source code for scrachy.content

#  Copyright 2023 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.

"""
Classes and utilities for extracting textual content from the HTML body.
"""

# Python Modules
from typing import Protocol

from scrapy.settings import Settings


# 3rd Party Modules

# Project Modules



class ContentExtractor(Protocol):
    """
    Structural interface for objects that extract text from HTML.

    An implementation provides a ``get_content`` method that takes the HTML
    as a string and returns the extracted content as a string.
    """

    def get_content(self, html: str) -> str:
        """
        Get the desired textual content from the HTML.

        :param html: The textual HTML to process.
        :return: The desired content (e.g., text with tags removed).
        """
        # Protocol stub: no implementation here; concrete extractors
        # supply the real logic.
        ...





class BaseContentExtractor(ContentExtractor):
    """
    Base class for content extractors that hold the Scrapy settings.
    """

    def __init__(self, settings: Settings):
        """
        A content extractor base class that keeps track of the project
        settings.

        :param settings: The Scrapy ``Settings``.
        """
        super().__init__()

        # Keep a reference to the settings on the instance.
        self.settings = settings