# Source code for scrachy.middleware.httpcache

##############################################################################
#  Copyright 2020 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.

##############################################################################
#
#   In addition to the terms listed above you must also follow the terms
#   set by the 3-Clause BSD license. See the BSD_LICENSE.md file or online
#   at <https://opensource.org/licenses/BSD-3-Clause>.
#
##############################################################################

"""
Classes for managing Http Cache Storage.
"""

# Python Modules
import datetime
import logging
import re

from typing import Optional, Any, cast

# 3rd Party Modules
from scrapy.http import Headers, HtmlResponse, Request, Response, TextResponse, XmlResponse
from scrapy.responsetypes import responsetypes
from scrapy.spiders import Spider
from scrapy.settings import Settings
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import RequestFingerprinter
from sqlalchemy import Engine
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict

# Project Modules
from scrachy.addons import try_import
from scrachy.content import ContentExtractor
from scrachy.db.engine import initialize_engine, session_scope
from scrachy.db.models import Response as CachedResponse, ScrapeHistory
from scrachy.db.repositories import ResponseRepository, ScrapeHistoryRepository
from scrachy.exceptions import InvalidSettingError
from scrachy.http_ import CachedHtmlResponse, CachedTextResponse, CachedXmlResponse
from scrachy.settings import iter_default_settings
from scrachy.settings.defaults import filter as default_filter_settings
from scrachy.settings.defaults import fingerprinter as default_fingerprinter_settings
from scrachy.settings.defaults import storage as default_storage_settings
from scrachy.settings.defaults.storage import RetrievalMethod
from scrachy.utils.datetime import now_tzaware
from scrachy.utils.imports import get_import_path
from scrachy.utils.request import DynamicHashRequestFingerprinter, ExpirationManager
from scrachy.utils.settings import compile_patterns

log = logging.getLogger(__name__)


# Maps a Scrapy text response class to the Scrachy cached response class
# that is instantiated in its place when a response is rebuilt from the
# cache (see ``AlchemyCacheStorage._make_scrapy_response``).
CLASS_ADAPTER = {
    HtmlResponse: CachedHtmlResponse,
    TextResponse: CachedTextResponse,
    XmlResponse: CachedXmlResponse,
}



[docs]
class BlacklistPolicy:
    """
    A wrapper around another cache control policy, but you can also blacklist
    urls (exclude from caching) via pattern matching using the
    :code:`SCRACHY_POLICY_EXCLUDE_URL_PATTERNS` setting. The patterns must
    either be strings that can be compiled with :meth:`re.compile` or
    :class:`re.Pattern` objects.

    The wrapped policy is loaded from the ``SCRACHY_POLICY_BASE_CLASS``
    setting. Patterns are applied with :meth:`re.Pattern.match`, i.e., they
    are anchored at the start of the url.
    """

    def __init__(self, settings: Settings):
        """
        :param settings: The Scrapy project settings. They are used to load
               the base policy and the exclusion patterns and are also
               passed to the base policy's constructor.
        """
        # This class has no explicit base class, so forwarding ``settings``
        # would end up in ``object.__init__`` and raise a ``TypeError``.
        super().__init__()

        self.base_policy = load_object(settings.get('SCRACHY_POLICY_BASE_CLASS'))(settings)
        self.exclude_patterns = compile_patterns(settings.getlist('SCRACHY_POLICY_EXCLUDE_URL_PATTERNS'))

    def should_cache_request(self, request: Request) -> bool:
        """
        :param request: The request to check.
        :return: ``True`` if the base policy allows caching the request and
                 its url is not excluded.
        """
        return self.base_policy.should_cache_request(request) and not self.is_excluded(request.url)

    def should_cache_response(self, response: Response, request: Request) -> bool:
        """
        :param response: The response to check.
        :param request: The request that produced the response.
        :return: ``True`` if the base policy allows caching the response and
                 the response's url is not excluded.
        """
        return self.base_policy.should_cache_response(response, request) and not self.is_excluded(response.url)

    def is_excluded(self, url: str) -> bool:
        """
        :param url: The url to test.
        :return: ``True`` if any of the exclusion patterns matches the
                 start of the url.
        """
        return any(p.match(url) for p in self.exclude_patterns)




[docs]
class AlchemyCacheStorage:

[docs]
    def __init__(self, settings: Settings):
        """
        This class implements a `scrapy cache storage backend
        <https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#writing-your-own-storage-backend>`_
        that uses a relational database to store the cached documents.

        :param settings: The Scrapy project settings.
        """
        self._settings = settings

        # The callable used to fingerprint requests. It is not known yet and
        # is taken from the crawler in `open_spider`.
        self._fingerprinter: Optional[RequestFingerprinter] = None

        # Decides whether a cached item is stale.
        self._expiration_manager = ExpirationManager(settings)

        # Optional; ``None`` unless ``SCRACHY_CONTENT_EXTRACTOR`` is set.
        self._content_extractor: Optional[ContentExtractor] = self._load_content_extractor(settings)

        # NOTE: This object does not hold an engine or a session factory.
        # The engine is initialized in `open_spider` and sessions are
        # obtained through `session_scope`.

    # region API
    # region Properties
    # Add properties for all valid settings. At the expense of being verbose
    # this helps avoid typos and validate them (although currently there is
    # very little validation).

[docs]
    def get(self, name: str, default: Optional[Any] = None) -> Optional[Any]:
        """
        Get a ``Scrachy`` setting without having to prefix the name with ``SCRACHY_``.

        :return: The value of the setting or ``None`` if it is not set.
        """
        return self._settings.get(f"SCRACHY_{name}".upper(), default)


    @property
    def is_scrapy_fingerprinter(self) -> bool:
        """
        Whether the import path of the current fingerprinter's class starts
        with ``scrapy``.
        """
        import_path = self.fingerprinter_import_path

        return import_path.startswith('scrapy')

    @property
    def is_scrachy_fingerprinter(self) -> bool:
        """
        Whether the import path of the current fingerprinter's class starts
        with ``scrachy``.
        """
        import_path = self.fingerprinter_import_path

        return import_path.startswith('scrachy')

    @property
    def fingerprinter_import_path(self) -> str:
        """
        The import path of the class of the current request fingerprinter,
        as computed by ``get_import_path``.
        """
        fingerprinter_cls = type(self._fingerprinter)

        return get_import_path(fingerprinter_cls)

    @property
    def fingerprinter_implementation(self) -> str:
        return self._settings.get('REQUEST_FINGERPRINTER_IMPLEMENTATION')

    @property
    def fingerprinter_hasher_import_path(self) -> Optional[str]:
        """
        The import path of the hasher used by the request fingerprinter.

        :return: The import path of the fingerprinter's ``hasher`` if the
                 fingerprinter is a scrachy fingerprinter, otherwise ``None``.
        """
        if self.is_scrachy_fingerprinter:
            scrachy_fingerprinter = cast(DynamicHashRequestFingerprinter, self._fingerprinter)

            return get_import_path(scrachy_fingerprinter.hasher)

        return None

    @property
    def engine_connect_args(self) -> dict[str, Any]:
        return self.get('DB_CONNECT_ARGS', {})

    @property
    def dialect(self) -> str:
        """
        The dialect used for connecting to the database server as specified in
        the project middleware. A dialect is always required and should never be
        ``None``. For supported dialects and drivers see the
        `SQLAlchemy website <https://docs.sqlalchemy.org/en/13/core/engines.html#supported-databases>`_.

        :return: The dialect name.
        """
        return self.get('DB_DIALECT')

    @property
    def driver(self) -> Optional[str]:
        """
        The name of the driver to use with the database or ``None`` to use the
        default driver provided by SqlAlchemy.

        :return: The driver name.
        """
        return self.get('DB_DRIVER')

    @property
    def host(self) -> Optional[str]:
        """
        Returns the host name specified in the project middleware.

        :return: The host name.
        """
        return self.get('DB_HOST')

    @property
    def port(self) -> Optional[int]:
        """
        The port used to connect to the database as specified in the project
        middleware. For sqlite this should return ``None``. ``None`` also
        represents the default port for other database dialects. An error
        is raised if the port specified in the middleware can not be cast
        to an integer.

        :return: The port.
        :raises ValueError: If the port is not ``None`` and cannot be cast to
                an int.
        """
        port = self.get('DB_PORT')

        try:
            return port if port is None else int(port)
        except ValueError as e:
            log.error(f"If the port is set (not None) it must be an integer but got: {port}")
            raise e

    @property
    def database(self) -> Optional[str]:
        """
        The name of the database to connect to. The only time this should be
        ``None`` is when using an in memory sqlite database (which mostly
        defeats the purpose of a cache storage engine).

        :return: The name of the database.
        """
        return self.get('DB_DATABASE')

    @property
    def schema(self) -> Optional[str]:
        """
        The name of the schema tables will be stored in.

        :return: The name of the database.
        """
        return self.get('DB_SCHEMA')

    @property
    def default_encoding(self) -> str:
        return self.get('CACHE_DEFAULT_ENCODING', 'utf-8')

    @property
    def save_history(self) -> bool:
        return self.get('CACHE_SAVE_HISTORY')

    @property
    def activation_delay(self) -> float:
        return self.get('CACHE_ACTIVATION_SECS')

    @property
    def bs4_parser(self) -> str:
        """
        The parser to use for parsing HTML with BeautifulSoup.

        :return:
        """
        return self.get('CONTENT_BS4_PARSER')

    @property
    def response_retrieval_method(self) -> RetrievalMethod:
        """
        The name of the response retrieval method.

        This determines how much information to retrieve in the response.

        minimal
            This returns the minimal amount of information and should be the
            fastest because it does not require any joins. However, it will
            return null values for the response status and headers. Use this
            method of you don't need these or the more detailed information.
        standard
            This returns the standard information an :class:`scrapy.http.HtmlResponse`
            does.
        full
            This returns a :class:`scrachy.http.CachedResponse`, which contains
            all the information available for an item in the cache.

        :return: The type of response to retrieve.
        """
        return self.get('CACHE_RESPONSE_RETRIEVAL_METHOD')

    @property
    def expiration_secs(self) -> int:
        """
        The value of the scrapy ``HTTPCACHE_EXPIRATION_SECS`` setting read as
        an integer, with ``0`` as the fallback when it is not set.

        :return: The number of seconds before the cached item becomes stale.
        """
        seconds = self._settings.getint('HTTPCACHE_EXPIRATION_SECS', 0)

        return seconds
    # endregion Properties

    # region Scrapy API

[docs]
    def open_spider(self, spider: Spider, engine: Optional[Engine] = None):
        """
        Take the request fingerprinter from the spider's crawler, initialize
        the database engine from the spider's settings and validate the
        settings.

        :param spider: The Scrapy spider.
        :param engine: Not used by the current implementation. The engine is
               always initialized from ``spider.settings``.
        :raises ValueError: If the crawler does not have a request
                fingerprinter.
        """
        fingerprinter = spider.crawler.request_fingerprinter
        self._fingerprinter = fingerprinter

        if fingerprinter is None:
            raise ValueError("The request fingerprinter has not been initialized")

        initialize_engine(spider.settings)

        # Validate last: some checks depend on the fingerprinter set above.
        self.validate_settings()



[docs]
    def close_spider(self, spider: Optional[Spider] = None):
        """
        Hook called when the spider is closed. This is currently a no-op:
        the call that disposed of the SqlAlchemy engine is disabled.

        :param spider: The Scrapy spider
        """

        # NOTE(review): nothing in this class disposes of the engine created
        # by `initialize_engine` - confirm it is released elsewhere.
        # self._engine.dispose()


[docs]
    def retrieve_response(
            self,
            spider: Spider,
            request: Request
    ) -> Optional[CachedTextResponse | CachedXmlResponse | CachedHtmlResponse]:
        """
        Retrieves an item from the cache if it exists, otherwise this returns
        ``None`` to signal downstream processes to continue retrieving the
        page normally. Depending on the value of the
        ``SCRACHY_CACHE_RESPONSE_RETRIEVAL_METHOD`` setting more or less
        information may be returned in the response.

        :param spider: The Scrapy Spider requesting the data.
        :param request: The request describing what information to retrieve.
        :return: If the page is in the cache (and not stale) then this will
                 return one of the cached response classes, otherwise it
                 will return ``None``.
        """
        cached_response: Optional[CachedResponse] = self._read_data(spider, request)

        # We didn't find anything (or the item is expired)
        if not cached_response:
            return None

        retrieval_method = self.response_retrieval_method

        # Always return this info
        kwargs = {
            'request': request,
            'url': request.url,
            'body': cached_response.body
        }

        if retrieval_method in ('standard', 'full'):
            if cached_response.headers:
                # The headers are stored in their raw form (see
                # `_make_cached_response`), so they have to be parsed back
                # into a mapping. ``Headers`` cannot be built from the raw
                # bytes directly.
                raw_headers = to_bytes(cached_response.headers)  # noqa
                kwargs['headers'] = Headers(headers_raw_to_dict(raw_headers))

            kwargs['status'] = cached_response.status

        if retrieval_method == 'full':
            # This will return (almost) all the data Scrachy has about this
            # cached item.
            kwargs |= {
                'scrape_timestamp': cached_response.scrape_timestamp,
                'extracted_text': cached_response.extracted_text,
                'body_length': cached_response.body_length,
                'extracted_text_length': cached_response.extracted_text_length,
                'scrape_history': cached_response.scrape_history
            }

        return self._make_scrapy_response(**kwargs)



[docs]
    def store_response(
            self,
            spider: Spider,
            request: Request,
            response: Response
    ):
        """
        Stores the response in the cache. Only text responses are supported;
        anything else is skipped with a warning. If history saving is
        enabled a scrape history entry is written in the same session.

        :param spider: The Scrapy Spider issuing the request.
        :param request: The request describing what data is desired.
        :param response: The response to be stored in the cache as created by
               Scrapy's standard downloading process.
        """
        if not isinstance(response, TextResponse):
            log.warning(f"The cache only supports TextResponses but received a '{type(response)}'")
            return

        text_response = cast(TextResponse, response)

        request_fingerprint: bytes = self._fingerprinter.fingerprint(request)
        scraped_at = now_tzaware()
        record = self._make_cached_response(request_fingerprint, scraped_at, request, text_response)

        with session_scope() as session:
            ResponseRepository(session).upsert(record)

            if self.save_history:
                history_entry = ScrapeHistory(
                    fingerprint=request_fingerprint,
                    scrape_timestamp=scraped_at,
                    body=text_response.text
                )
                ScrapeHistoryRepository(session).insert(history_entry)

    # endregion Scrapy API


[docs]
    def validate_settings(self):
        """
        Run all the settings checks of the storage backend, in order:

        * unknown scrachy settings (e.g., typos),
        * options that require an optional dependency,
        * the database connection parameters.

        The validation is minor. It is still primarily up to the user to
        ensure the database connection properties are valid for the type of
        database being used.

        :raises InvalidSettingError: If one of the checks fails.
        """
        checks = (
            self._validate_unknown_settings,
            self._validate_supported_options,
            self._validate_database_parameters,
        )

        for check in checks:
            check()


    @staticmethod
    def clear_cache():
        """
        Delete every cached response and every scrape history entry.
        """
        with session_scope() as session:
            # ``Response`` and ``Request`` in this module are Scrapy's HTTP
            # classes, which cannot be queried. The ORM models are
            # ``CachedResponse`` and ``ScrapeHistory``.
            # The history rows are removed first - presumably they reference
            # the responses; verify against the models.
            session.query(ScrapeHistory).delete()
            session.query(CachedResponse).delete()


[docs]
    def dump_cache(self) -> list[CachedResponse]:
        """
        Dump the contents of the cache. This is not recommended except for
        debugging.

        :return: A list of SQLAlchemy result objects that contains all
                 the items in the cache.
        """
        with session_scope() as session:
            cached_responses = list(ResponseRepository(session).find_all())

            # Detach the objects before the session scope ends, as
            # `_read_data` does, so they can be used by the caller.
            session.expunge_all()

        return cached_responses

    # endregion Extended API
    # endregion API

    # region Utility Methods
    # region Initialization
    @staticmethod
    def _load_content_extractor(settings: Settings) -> Optional[ContentExtractor]:
        extraction_obj = settings.get('SCRACHY_CONTENT_EXTRACTOR')

        if not extraction_obj:
            return None

        return load_object(extraction_obj)(settings)
    # endregion Initialization

    # region Validation
    def _validate_unknown_settings(self):
        """
        Check that there aren't any unknown scrachy settings (e.g., typos).

        Only keys starting with one of the storage related prefixes are
        checked. Such a key is valid if it is one of the default storage,
        fingerprinter or filter settings.

        :raises InvalidSettingError: For the first unknown key found.
        """
        checked_prefix = re.compile(r'^SCRACHY_(CACHE|DB|USE_CONTENT|CONTENT|USE_SIMHASH|SIMHASH)')

        known_keys = set()
        for defaults in (default_storage_settings, default_fingerprinter_settings, default_filter_settings):
            known_keys |= {k for k, _ in iter_default_settings(defaults)}

        unknown_key = next(
            (k for k, _ in self._settings.items() if checked_prefix.match(k) and k not in known_keys),
            None
        )

        if unknown_key is not None:
            raise InvalidSettingError(f"Unknown scrachy setting: {unknown_key}")

    def _validate_supported_options(self):
        """
        Make sure the parameters for middleware that take one are valid.

        :raises InvalidSettingError:
        """
        if 'lxml' in self.bs4_parser:
            try_import('lxml', 'AlchemyCacheStorageAddon')

        if self.bs4_parser == 'html5lib':
            try_import('html5lib', 'AlchemyCacheStorageAddon')

    def _validate_database_parameters(self):
        """
        Check to make sure the database parameters are valid.

        :raises InvalidSettingError:
        """

        if self.port and not isinstance(self.port, int):
            raise InvalidSettingError(
                f"If the port is specified it must be an integer, but was: {self.port}"
            )

        if self.dialect != 'sqlite' and self.database is None:
            raise InvalidSettingError(
                "You must specify a database name for dialects except sqlite."
            )

        if self.dialect == 'sqlite' and self.driver == 'pysqlcipher':
            raise InvalidSettingError("The pysqlcipher driver is not supported.")

    # region Retrieve Response Utilities
    def _read_data(self, spider: Spider, request: Request) -> Optional[CachedResponse]:
        """
        Look up the cached response for the request by its fingerprint.

        :param spider: The Scrapy spider (not used here).
        :param request: The request to look up.
        :return: The cached response, detached from the session, or ``None``
                 if there is no entry for the request or the entry is stale.
        """
        # A previous version stored the diff of the body if there was
        # already an entry, to save storage space. That traded disk space
        # for latency (several objects had to be retrieved and the diffs
        # applied to get the actual body) and added a lot of complexity that
        # is difficult to maintain. This version stores everything as is.
        fingerprint = self._fingerprinter.fingerprint(request)

        with session_scope() as session:
            responses = ResponseRepository(session)
            scraped_at = responses.find_timestamp_by_fingerprint(fingerprint)

            # A missing timestamp indicates the data is not in the cache and
            # stale data must not be used either.
            # Note the CachePolicy does something different. It only uses
            # info available from the page itself (e.g., the headers) and
            # doesn't know anything about how long (if at all) the data
            # has been in our cache.
            if not scraped_at or self._expiration_manager.is_stale(request.url, scraped_at):
                return None

            cached = responses.find_by_fingerprint(fingerprint, self.response_retrieval_method)

            # Detach the result so it stays usable outside the session scope.
            session.expunge_all()

        return cached
    # endregion Retrieve Response Utilities

    # region Common Utilities
    def _make_cached_response(
            self,
            fingerprint: bytes,
            timestamp: datetime.datetime,
            request: Request,
            response: Response
    ):
        """
        Build the database record for a downloaded response.

        :param fingerprint: The fingerprint of the request.
        :param timestamp: The time the response was scraped.
        :param request: The request that produced the response.
        :param response: The response to cache. Its ``text`` attribute is
               read, so it is expected to be a text response.
        :return: The (not yet persisted) cached response.
        """
        body_text = response.text

        record = CachedResponse(
            fingerprint=fingerprint,
            scrape_timestamp=timestamp,
            url=request.url,
            request_method=request.method,
            body=body_text,
            status=response.status,
            body_length=len(response.body)
        )

        request_body = request.body
        if request_body is not None:
            record.request_body = to_unicode(request_body)

        response_headers = response.headers
        if response_headers is not None:
            # The headers are persisted in their raw form as text.
            record.headers = to_unicode(headers_dict_to_raw(response_headers))

        extractor = self._content_extractor
        if extractor is not None:
            record.extracted_text = extractor.get_content(body_text)
            # The length is measured in bytes, not characters.
            record.extracted_text_length = len(to_bytes(record.extracted_text))

        return record

    def _make_scrapy_response(self, **kwargs):
        """
        Build the cached response object returned to Scrapy.

        The Scrapy response class is guessed from the headers, url and body
        and then mapped to its cached counterpart through ``CLASS_ADAPTER``.

        :param kwargs: The keyword arguments for the response constructor.
               The ``headers``, ``url`` and ``body`` entries (if present) are
               also used to pick the response class.
        :return: The cached response.
        """
        scrapy_cls = responsetypes.from_args(
            headers=kwargs.get('headers'),
            url=kwargs.get('url'),
            body=kwargs.get('body')
        )

        # The guessed class is not necessarily a key of the adapter (e.g., it
        # can be a subclass of ``TextResponse`` or the base ``Response``), so
        # use the closest adapted ancestor instead of failing with a
        # ``KeyError``. Only text responses are stored in the cache, which
        # makes the text response a safe fallback.
        scrachy_cls = next(
            (CLASS_ADAPTER[cls] for cls in scrapy_cls.__mro__ if cls in CLASS_ADAPTER),
            CachedTextResponse
        )

        try:
            # First try to use the encoding from the cached information
            response = scrachy_cls(**kwargs)
        except TypeError:
            # If that fails, use the default encoding
            response = scrachy_cls(encoding=self.default_encoding, **kwargs)

            log.warning(
                f"Unable to find an appropriate encoding for the page '{kwargs.get('url')}'. "
                f"Using '{self.default_encoding}' instead."
            )

        return response

    # endregion Common Utilities
    # endregion Utility Methods