##############################################################################
# Copyright 2020 Reid Swanson.
#
# This file is part of scrachy.
#
# scrachy is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scrachy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with scrachy. If not, see <https://www.gnu.org/licenses/>.
##############################################################################
#
# In addition to the terms listed above you must also follow the terms
# set by the 3-Clause BSD license. See the BSD_LICENSE.md file or online
# at <https://opensource.org/licenses/BSD-3-Clause>.
#
##############################################################################
"""
Classes for managing Http Cache Storage.
"""
# Python Modules
import datetime
import logging
import re
from typing import Optional, Any, cast
# 3rd Party Modules
from scrapy.http import Headers, HtmlResponse, Request, Response, TextResponse, XmlResponse
from scrapy.responsetypes import responsetypes
from scrapy.spiders import Spider
from scrapy.settings import Settings
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import RequestFingerprinter
from sqlalchemy import Engine
from w3lib.http import headers_dict_to_raw
# Project Modules
from scrachy.addons import try_import
from scrachy.content import ContentExtractor
from scrachy.db.engine import initialize_engine, session_scope
from scrachy.db.models import Response as CachedResponse, ScrapeHistory
from scrachy.db.repositories import ResponseRepository, ScrapeHistoryRepository
from scrachy.exceptions import InvalidSettingError
from scrachy.http_ import CachedHtmlResponse, CachedTextResponse, CachedXmlResponse
from scrachy.settings import iter_default_settings
from scrachy.settings.defaults import filter as default_filter_settings
from scrachy.settings.defaults import fingerprinter as default_fingerprinter_settings
from scrachy.settings.defaults import storage as default_storage_settings
from scrachy.settings.defaults.storage import RetrievalMethod
from scrachy.utils.datetime import now_tzaware
from scrachy.utils.imports import get_import_path
from scrachy.utils.request import DynamicHashRequestFingerprinter, ExpirationManager
from scrachy.utils.settings import compile_patterns
# Module level logger, named after this module.
log = logging.getLogger(__name__)
# Maps a Scrapy response class to the scrachy response class that is
# instantiated in its place when a response is rebuilt from the cache
# (see ``AlchemyCacheStorage._make_scrapy_response``).
CLASS_ADAPTER = {
    HtmlResponse: CachedHtmlResponse,
    TextResponse: CachedTextResponse,
    XmlResponse: CachedXmlResponse,
}
# Cache control policies
class BlacklistPolicy:
    """
    A wrapper around another cache control policy, but you can also blacklist
    urls (exclude from caching) via pattern matching using the
    :code:`SCRACHY_POLICY_EXCLUDE_URL_PATTERNS` setting. The patterns must
    either be strings that can be compiled with :meth:`re.compile` or
    :class:`re.Pattern` objects.

    The wrapped policy is loaded from the ``SCRACHY_POLICY_BASE_CLASS``
    setting and every decision is delegated to it. The two ``should_cache_*``
    decisions are additionally vetoed for blacklisted urls.
    """

    def __init__(self, settings: Settings):
        """
        :param settings: The Scrapy project settings.
        """
        # This class has no explicit base class, so ``object.__init__`` must
        # not receive the settings (it would raise a ``TypeError``).
        super().__init__()

        self.base_policy = load_object(settings.get('SCRACHY_POLICY_BASE_CLASS'))(settings)
        self.exclude_patterns = compile_patterns(settings.getlist('SCRACHY_POLICY_EXCLUDE_URL_PATTERNS'))

    def should_cache_request(self, request: Request) -> bool:
        """
        :return: ``True`` if the base policy allows caching the request and
                 its url is not blacklisted.
        """
        return self.base_policy.should_cache_request(request) and not self.is_excluded(request.url)

    def should_cache_response(self, response: Response, request: Request) -> bool:
        """
        :return: ``True`` if the base policy allows caching the response and
                 the response url is not blacklisted.
        """
        return self.base_policy.should_cache_response(response, request) and not self.is_excluded(response.url)

    def is_cached_response_fresh(self, cached_response: Response, request: Request) -> bool:
        """
        Delegate the freshness check to the base policy. Scrapy's http cache
        middleware calls this on the configured policy.
        """
        return self.base_policy.is_cached_response_fresh(cached_response, request)

    def is_cached_response_valid(self, cached_response: Response, response: Response, request: Request) -> bool:
        """
        Delegate the validity check to the base policy. Scrapy's http cache
        middleware calls this on the configured policy.
        """
        return self.base_policy.is_cached_response_valid(cached_response, response, request)

    def is_excluded(self, url: str) -> bool:
        """
        :param url: The url to test.
        :return: ``True`` if any exclude pattern matches at the start of
                 the url (the patterns are applied with ``match``).
        """
        return any(p.match(url) for p in self.exclude_patterns)
# Cache storage backends
class AlchemyCacheStorage:
    """
    A Scrapy http cache storage backend that persists responses in a
    relational database through SQLAlchemy.
    """

    def __init__(self, settings: Settings):
        """
        This class implements a `scrapy cache storage backend
        <https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#writing-your-own-storage-backend>`_
        that uses a relational database to store the cached documents.

        :param settings: The Scrapy project settings.
        """
        self._settings = settings
        self._expiration_manager = ExpirationManager(settings)
        self._content_extractor: Optional[ContentExtractor] = self._load_content_extractor(settings)

        # The callable used to fingerprint requests. It is only known once
        # the spider is opened, so it is set in `open_spider`.
        self._fingerprinter: Optional[RequestFingerprinter] = None

    # region API
    # region Properties
    # Add properties for all valid settings. At the expense of being verbose
    # this helps avoid typos and validate them (although currently there is
    # very little validation).
    def get(self, name: str, default: Optional[Any] = None) -> Optional[Any]:
        """
        Get a ``Scrachy`` setting without having to prefix the name with ``SCRACHY_``.

        :param name: The name of the setting without the ``SCRACHY_`` prefix.
               The full name is upper cased before it is looked up.
        :param default: The value to return if the setting is not set.
        :return: The value of the setting or ``default`` if it is not set.
        """
        return self._settings.get(f"SCRACHY_{name}".upper(), default)

    @property
    def is_scrapy_fingerprinter(self) -> bool:
        """
        ``True`` if the import path of the fingerprinter starts with ``scrapy``.
        """
        return self.fingerprinter_import_path.startswith('scrapy')

    @property
    def is_scrachy_fingerprinter(self) -> bool:
        """
        ``True`` if the import path of the fingerprinter starts with ``scrachy``.
        """
        return self.fingerprinter_import_path.startswith('scrachy')

    @property
    def fingerprinter_import_path(self) -> str:
        """
        The import path of the class of the current fingerprinter.
        """
        return get_import_path(type(self._fingerprinter))

    @property
    def fingerprinter_implementation(self) -> str:
        """
        The value of the ``REQUEST_FINGERPRINTER_IMPLEMENTATION`` setting.
        """
        return self._settings.get('REQUEST_FINGERPRINTER_IMPLEMENTATION')

    @property
    def fingerprinter_hasher_import_path(self) -> Optional[str]:
        """
        The import path of the hasher used by the fingerprinter, or ``None``
        if the fingerprinter is not a scrachy fingerprinter.
        """
        if not self.is_scrachy_fingerprinter:
            return None

        hasher = cast(DynamicHashRequestFingerprinter, self._fingerprinter).hasher

        return get_import_path(hasher)

    @property
    def engine_connect_args(self) -> dict[str, Any]:
        """
        The value of the ``SCRACHY_DB_CONNECT_ARGS`` setting (an empty
        dictionary if it is not set).
        """
        return self.get('DB_CONNECT_ARGS', {})

    @property
    def dialect(self) -> str:
        """
        The dialect used for connecting to the database server as specified in
        the project settings. A dialect is always required and should never be
        ``None``. For supported dialects and drivers see the
        `SQLAlchemy website <https://docs.sqlalchemy.org/en/13/core/engines.html#supported-databases>`_.

        :return: The dialect name.
        """
        return self.get('DB_DIALECT')

    @property
    def driver(self) -> Optional[str]:
        """
        The name of the driver to use with the database or ``None`` to use the
        default driver provided by SqlAlchemy.

        :return: The driver name.
        """
        return self.get('DB_DRIVER')

    @property
    def host(self) -> Optional[str]:
        """
        Returns the host name specified in the project settings.

        :return: The host name.
        """
        return self.get('DB_HOST')

    @property
    def port(self) -> Optional[int]:
        """
        The port used to connect to the database as specified in the project
        settings. For sqlite this should return ``None``. ``None`` also
        represents the default port for other database dialects. An error
        is raised if the port specified in the settings can not be cast
        to an integer.

        :return: The port.
        :raises ValueError: If the port is not ``None`` and cannot be cast to
                an int.
        """
        port = self.get('DB_PORT')

        if port is None:
            return None

        try:
            return int(port)
        except ValueError:
            log.error(f"If the port is set (not None) it must be an integer but got: {port}")
            # A bare raise keeps the original traceback intact.
            raise

    @property
    def database(self) -> Optional[str]:
        """
        The name of the database to connect to. The only time this should be
        ``None`` is when using an in memory sqlite database (which mostly
        defeats the purpose of a cache storage engine).

        :return: The name of the database.
        """
        return self.get('DB_DATABASE')

    @property
    def schema(self) -> Optional[str]:
        """
        The name of the schema tables will be stored in.

        :return: The name of the schema.
        """
        return self.get('DB_SCHEMA')

    @property
    def default_encoding(self) -> str:
        """
        The encoding to fall back on when a cached response can not be
        rebuilt without one (``utf-8`` if the setting is not set).
        """
        return self.get('CACHE_DEFAULT_ENCODING', 'utf-8')

    @property
    def save_history(self) -> bool:
        """
        The value of the ``SCRACHY_CACHE_SAVE_HISTORY`` setting. When it is
        truthy, :meth:`store_response` also inserts a scrape history entry.
        """
        return self.get('CACHE_SAVE_HISTORY')

    @property
    def activation_delay(self) -> float:
        """
        The value of the ``SCRACHY_CACHE_ACTIVATION_SECS`` setting.
        """
        return self.get('CACHE_ACTIVATION_SECS')

    @property
    def bs4_parser(self) -> str:
        """
        The parser to use for parsing HTML with BeautifulSoup.

        :return: The value of the ``SCRACHY_CONTENT_BS4_PARSER`` setting.
        """
        return self.get('CONTENT_BS4_PARSER')

    @property
    def response_retrieval_method(self) -> RetrievalMethod:
        """
        The name of the response retrieval method.
        This determines how much information to retrieve in the response.

        minimal
            This returns the minimal amount of information and should be the
            fastest because it does not require any joins. However, it will
            return null values for the response status and headers. Use this
            method if you don't need these or the more detailed information.
        standard
            This returns the standard information an :class:`scrapy.http.HtmlResponse`
            does.
        full
            This returns a :class:`scrachy.http.CachedResponse`, which contains
            all the information available for an item in the cache.

        :return: The type of response to retrieve.
        """
        return self.get('CACHE_RESPONSE_RETRIEVAL_METHOD')

    @property
    def expiration_secs(self) -> int:
        """
        The value of the scrapy HTTPCACHE_EXPIRATION_SECS setting.

        :return: The number of seconds before the cached item becomes stale.
                 Stale items will be re-downloaded and processed through the
                 normal pipeline regardless if they are in the cache or not.
        """
        return self._settings.getint('HTTPCACHE_EXPIRATION_SECS', 0)

    # endregion Properties

    # region Scrapy API
    def open_spider(self, spider: Spider, engine: Optional[Engine] = None):
        """
        Pick up the request fingerprinter of the crawler, initialize the
        database engine from the spider settings and validate the settings.

        :param spider: The Scrapy spider.
        :param engine: Currently ignored. The engine is always initialized
               from the spider settings. The parameter is kept so existing
               callers keep working.
        :raises ValueError: If the crawler has no request fingerprinter.
        """
        self._fingerprinter = spider.crawler.request_fingerprinter

        if self._fingerprinter is None:
            raise ValueError("The request fingerprinter has not been initialized")

        initialize_engine(spider.settings)

        self.validate_settings()

    def close_spider(self, spider: Optional[Spider] = None):
        """
        Currently a no-op. This storage does not hold an engine of its own,
        so there is nothing to dispose of here.

        :param spider: The Scrapy spider
        """

    def retrieve_response(
            self,
            spider: Spider,
            request: Request
    ) -> Optional[CachedTextResponse | CachedXmlResponse | CachedHtmlResponse]:
        """
        Retrieves an item from the cache if it exists, otherwise this returns
        ``None`` to signal downstream processes to continue retrieving the
        page normally. Depending on the value of the
        ``SCRACHY_CACHE_RESPONSE_RETRIEVAL_METHOD`` setting more or less
        information may be returned in the response.

        :param spider: The Scrapy Spider requesting the data.
        :param request: The request describing what information to retrieve.

        :return: If the page is in the cache (and not stale) then this will
                 return one of the scrachy cached response classes,
                 otherwise it will return ``None``.
        """
        # Imported locally because the module only imports the inverse
        # function (``headers_dict_to_raw``) at the top of the file.
        from w3lib.http import headers_raw_to_dict

        cached_response: Optional[CachedResponse] = self._read_data(spider, request)

        # We didn't find anything (or the item is expired)
        if cached_response is None:
            return None

        # Create a new Response from the cached items.
        response_retrieval_method = self.response_retrieval_method

        # Always return this info
        kwargs: dict[str, Any] = {
            'request': request,
            'url': request.url,
            'body': cached_response.body
        }

        if response_retrieval_method in ('standard', 'full'):
            if cached_response.headers:
                # The headers are stored as a single raw header block (see
                # `_make_cached_response`), but `Headers` expects a mapping,
                # so the block has to be parsed back into one first.
                raw_headers = to_bytes(cached_response.headers)
                kwargs['headers'] = Headers(headers_raw_to_dict(raw_headers))

            kwargs['status'] = cached_response.status

        if response_retrieval_method == 'full':
            # This will return (almost) all the data Scrachy has about this
            # cached item.
            kwargs |= {
                'scrape_timestamp': cached_response.scrape_timestamp,
                'extracted_text': cached_response.extracted_text,
                'body_length': cached_response.body_length,
                'extracted_text_length': cached_response.extracted_text_length,
                'scrape_history': cached_response.scrape_history
            }

        return self._make_scrapy_response(**kwargs)

    def store_response(
            self,
            spider: Spider,
            request: Request,
            response: Response
    ):
        """
        Stores the response in the cache. Responses that are not
        :class:`scrapy.http.TextResponse` instances are skipped with a warning.

        :param spider: The Scrapy Spider issuing the request.
        :param request: The request describing what data is desired.
        :param response: The response to be stored in the cache as created by
               Scrapy's standard downloading process.
        """
        if not isinstance(response, TextResponse):
            log.warning(f"The cache only supports TextResponses but received a '{type(response)}'")
            return

        response = cast(TextResponse, response)
        fingerprint: bytes = self._fingerprinter.fingerprint(request)
        timestamp = now_tzaware()

        cached_response = self._make_cached_response(fingerprint, timestamp, request, response)

        with session_scope() as session:
            response_repo = ResponseRepository(session)
            response_repo.upsert(cached_response)

            if self.save_history:
                history_repo = ScrapeHistoryRepository(session)
                history_repo.insert(
                    ScrapeHistory(
                        fingerprint=fingerprint,
                        scrape_timestamp=timestamp,
                        body=response.text
                    )
                )

    # endregion Scrapy API

    # region Extended API
    def validate_settings(self):
        """
        This makes sure that any setting starting with one of the storage
        related ``SCRACHY`` prefixes is known to the storage backend.

        It performs some minor validation like checking to make sure the
        port is an integer and a database name is specified unless the
        dialect is sqlite. It is still primarily up to the user to ensure the
        database connection properties are valid for the type of database
        being used.

        :raises InvalidSettingError: If there are:

                * unknown scrachy settings.
                * invalid database settings.
        :raises ValueError: If the port is set but can not be cast to an int.
        """
        self._validate_unknown_settings()
        self._validate_supported_options()
        self._validate_database_parameters()

    @staticmethod
    def clear_cache():
        """
        Delete all scrape history entries and all cached responses.
        """
        with session_scope() as session:
            # Query the ORM models, not the Scrapy ``Request``/``Response``
            # classes, which are not mapped. The history is removed first.
            # NOTE(review): presumably the history rows reference the
            # responses; confirm against the models.
            session.query(ScrapeHistory).delete()
            session.query(CachedResponse).delete()

    def dump_cache(self) -> list[CachedResponse]:
        """
        Dump the contents of the cache. This is not recommended except for
        debugging.

        :return: A list of the cached response model objects for all
                 the items in the cache.
        """
        with session_scope() as session:
            responses = list(ResponseRepository(session).find_all())

            # Detach the objects so they can be used after the session ends
            # (the same is done in `_read_data`).
            session.expunge_all()

            return responses

    # endregion Extended API
    # endregion API

    # region Utility Methods
    # region Initialization
    @staticmethod
    def _load_content_extractor(settings: Settings) -> Optional[ContentExtractor]:
        """
        Instantiate the object named by the ``SCRACHY_CONTENT_EXTRACTOR``
        setting.

        :param settings: The Scrapy project settings.
        :return: The content extractor or ``None`` if the setting is not set.
        """
        extraction_obj = settings.get('SCRACHY_CONTENT_EXTRACTOR')

        if not extraction_obj:
            return None

        return load_object(extraction_obj)(settings)

    # endregion Initialization

    # region Validation
    def _validate_unknown_settings(self):
        """
        Check that there aren't any unknown scrachy settings (e.g., typos).
        Only settings with a storage related prefix are checked.

        :raises InvalidSettingError: If an unknown setting is found.
        """
        storage_re = re.compile(r'^SCRACHY_(CACHE|DB|USE_CONTENT|CONTENT|USE_SIMHASH|SIMHASH)')

        storage_keys = {k for k, _ in iter_default_settings(default_storage_settings)}
        finger_keys = {k for k, _ in iter_default_settings(default_fingerprinter_settings)}
        filter_keys = {k for k, _ in iter_default_settings(default_filter_settings)}
        valid_keys = storage_keys | finger_keys | filter_keys

        for key in self._settings:
            if storage_re.match(key) and key not in valid_keys:
                raise InvalidSettingError(f"Unknown scrachy setting: {key}")

    def _validate_supported_options(self):
        """
        Make sure the optional packages required by the configured
        BeautifulSoup parser can be imported.
        """
        # Nothing to check if no parser is configured.
        parser = self.bs4_parser or ''

        if 'lxml' in parser:
            try_import('lxml', 'AlchemyCacheStorageAddon')

        if parser == 'html5lib':
            try_import('html5lib', 'AlchemyCacheStorageAddon')

    def _validate_database_parameters(self):
        """
        Check to make sure the database parameters are valid.

        :raises InvalidSettingError: If no database name is given for a
                dialect other than sqlite or the pysqlcipher driver is used.
        :raises ValueError: If the port is set but can not be cast to an int.
        """
        # Reading the property already validates the port: it raises a
        # ``ValueError`` if the value can not be cast to an int. The check
        # below is only a defensive guard.
        port = self.port

        if port and not isinstance(port, int):
            raise InvalidSettingError(
                f"If the port is specified it must be an integer, but was: {port}"
            )

        dialect = self.dialect

        if dialect != 'sqlite' and self.database is None:
            raise InvalidSettingError(
                "You must specify a database name for dialects except sqlite."
            )

        if dialect == 'sqlite' and self.driver == 'pysqlcipher':
            raise InvalidSettingError("The pysqlcipher driver is not supported.")

    # endregion Validation

    # region Retrieve Response Utilities
    def _read_data(self, spider: Spider, request: Request) -> Optional[CachedResponse]:
        """
        Look up the cached response for the request.

        :param spider: The Scrapy spider (not used).
        :param request: The request to look up by its fingerprint.
        :return: The cached response, detached from the session, or ``None``
                 if there is no entry or the entry is stale.
        """
        # In an effort to save storage space in the previous version I stored
        # the diff of the body if there was already an entry. I think this
        # was misguided, because it traded disk space for latency by having
        # to retrieve multiple objects from the database and then apply the
        # diffs to get the actual body. So, in this version I just store
        # everything no matter what. It also adds a lot of additional complexity
        # that is difficult to maintain.
        fingerprint = self._fingerprinter.fingerprint(request)

        with session_scope() as session:
            repo = ResponseRepository(session)
            scrape_timestamp = repo.find_timestamp_by_fingerprint(fingerprint)

            # A missing timestamp indicates the data is not in the cache.
            if not scrape_timestamp:
                return None

            # Don't use the cached data if it is stale.
            # Note the CachePolicy does something different. It only uses
            # info available from the page itself (e.g., the headers) and
            # doesn't know anything about how long (if at all) the data
            # has been in our cache.
            if self._expiration_manager.is_stale(request.url, scrape_timestamp):
                return None

            cached_response = repo.find_by_fingerprint(fingerprint, self.response_retrieval_method)

            # Detach the object so it can be used after the session ends.
            session.expunge_all()

            return cached_response

    # endregion Retrieve Response Utilities

    # region Common Utilities
    def _make_cached_response(
            self,
            fingerprint: bytes,
            timestamp: datetime.datetime,
            request: Request,
            response: TextResponse
    ) -> CachedResponse:
        """
        Build the database model for a downloaded response.

        :param fingerprint: The fingerprint of the request.
        :param timestamp: The time the response was scraped.
        :param request: The request that produced the response.
        :param response: The downloaded response.
        :return: The (not yet persisted) cached response model.
        """
        cached_response = CachedResponse(
            fingerprint=fingerprint,
            scrape_timestamp=timestamp,
            url=request.url,
            request_method=request.method,
            body=response.text,
            status=response.status,
            body_length=len(response.body)
        )

        if request.body is not None:
            cached_response.request_body = to_unicode(request.body)

        if response.headers is not None:
            # The headers are stored as one raw header block.
            cached_response.headers = to_unicode(headers_dict_to_raw(response.headers))

        if self._content_extractor is not None:
            extracted_text = self._content_extractor.get_content(response.text)
            cached_response.extracted_text = extracted_text

            # Only measure the text if the extractor produced some.
            if extracted_text is not None:
                cached_response.extracted_text_length = len(to_bytes(extracted_text))

        return cached_response

    def _make_scrapy_response(self, **kwargs):
        """
        Create the scrachy response object for a cached item.

        :param kwargs: The keyword arguments for the response constructor.
               The ``headers``, ``url`` and ``body`` entries are also used to
               choose the response class.
        :return: An instance of one of the classes in ``CLASS_ADAPTER``.
        """
        scrapy_cls = responsetypes.from_args(
            headers=kwargs.get('headers'),
            url=kwargs.get('url'),
            body=kwargs.get('body')
        )

        # Scrapy may choose a class that has no direct entry in the adapter
        # (e.g., a plain ``Response`` or another ``TextResponse`` subclass).
        # Use the closest base class that has one and fall back to the text
        # response, because only text responses are ever stored.
        scrachy_cls = next(
            (CLASS_ADAPTER[cls] for cls in scrapy_cls.__mro__ if cls in CLASS_ADAPTER),
            CachedTextResponse
        )

        try:
            # First try to use the encoding from the cached information
            response = scrachy_cls(**kwargs)
        except TypeError:
            # If that fails, use the default encoding
            response = scrachy_cls(encoding=self.default_encoding, **kwargs)
            log.warning(
                f"Unable to find an appropriate encoding for the page '{kwargs.get('url')}'. "
                f"Using '{self.default_encoding}' instead."
            )

        return response

    # endregion Common Utilities
    # endregion Utility Methods