# Copyright 2020 Reid Swanson.
#
# This file is part of scrachy.
#
# scrachy is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scrachy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with scrachy. If not, see <https://www.gnu.org/licenses/>.
"""
Middleware for filtering (or ignoring) responses if they are fresh in the
cache.
"""
# Python Modules
import logging
# 3rd Party Modules
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest
import scrapy.settings
# Project Modules
from scrachy.db.engine import session_scope
from scrachy.db.repositories import ResponseRepository
from scrachy.utils.datetime import now_tzaware
from scrachy.utils.request import ExpirationManager
from scrachy.utils.settings import compile_patterns
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class CachedResponseFilter:
    """
    Middleware that ignores requests whose responses are already fresh in
    the cache. See :meth:`__init__` for the exact filtering rules.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """
        Alternate constructor that builds the middleware from a crawler.

        :param crawler: The current crawler.
        :return: A new instance of this middleware.
        """
        return cls(crawler)

    def __init__(self, crawler: Crawler):
        """
        Sometimes you scrape the same domains multiple times looking for
        new content. However, when crawling them you might encounter pages
        that you have already scraped. If your extraction rules have not
        changed since the last crawl it may not be worth reprocessing those
        pages.

        This middleware looks to see if a response corresponding to the
        request is already in the cache and is not stale. If there is no
        such response, ``process_request`` returns immediately and the
        request proceeds as usual. Otherwise, the following rules determine
        whether the request is filtered.

        * You can specify a set of patterns to match against the request
          url. A request whose url is matched (anywhere in the url) by any
          of the patterns is never filtered, regardless of whether it is in
          the cache or not. This might be useful after changing the parsing
          rules for a set of pages. The patterns are specified using the
          ``SCRACHY_CACHED_RESPONSE_FILTER_EXCLUSIONS`` setting, which takes
          a list of ``re.Pattern`` objects or strings that can be compiled
          to regular expressions.
        * A request whose meta key ``dont_filter`` is set to ``True`` is
          not processed by this middleware and is never filtered.
        * A request that is already excluded from caching via the
          ``dont_cache`` request meta key is also never filtered.

        Any other request that has a fresh response in the cache will be
        filtered.

        :param crawler: The current crawler.
        """
        # Use the spider's settings when a spider is attached to the
        # crawler; otherwise fall back to the crawler's own settings.
        settings = crawler.spider.settings if crawler.spider else crawler.settings

        self._fingerprinter = crawler.request_fingerprinter
        self._exclude_patterns = compile_patterns(settings.get('SCRACHY_CACHED_RESPONSE_FILTER_EXCLUSIONS'))
        self._expiration_manager = ExpirationManager(settings)

    # noinspection PyUnusedLocal
    def process_request(self, request: scrapy.http.Request, spider: scrapy.spiders.Spider):
        """
        Ignore the request if a fresh response for it is already cached.

        :param request: The Scrapy request.
        :param spider: The Scrapy Spider issuing the request (unused).
        :return: ``None`` when the request should continue to be processed
                 normally.

        :raises IgnoreRequest: If the item is already cached, and it does
                not meet the requirement to be excluded.
        """
        meta = request.meta

        # If dont_cache or dont_filter is set then never filter the request.
        if meta.get('dont_cache') or meta.get('dont_filter'):
            return

        url = request.url

        # If the url matches the exclusion list then don't filter the
        # request even if it is already in the cache. This only needs the
        # url, so do it before paying for a fingerprint and a cache lookup.
        if any(pattern.search(url) for pattern in self._exclude_patterns):
            return

        fingerprint = self._fingerprinter.fingerprint(request)

        # Look up when (if ever) a response for this request was cached.
        # NOTE(review): assumes the returned timestamp is a plain value that
        # remains usable after the session is closed - confirm against
        # ResponseRepository.
        with session_scope() as session:
            scrape_timestamp = ResponseRepository(session).find_timestamp_by_fingerprint(fingerprint)

        # If the item is not in the cache, then don't filter it.
        if scrape_timestamp is None:
            return

        # If the item in the cache is stale, don't filter it.
        if self._expiration_manager.is_stale(url, scrape_timestamp, now_tzaware()):
            return

        # Otherwise a fresh response is cached: ignore the request.
        raise IgnoreRequest(f"A fresh response is already cached for <{url}>")