Support for TV show episode information scraping

commit 98cb4b54b8 (parent ed168eea92)
Author: C5H12O5
Date:   2023-08-11 01:13:16 +08:00
7 changed files with 136 additions and 17 deletions

View File

@@ -45,8 +45,8 @@
   "movie": {
     "title": "xtext: ./name",
     "tagline": "",
-    "certificate": "",
     "original_available": "xtext: ./datePublished",
+    "certificate": "",
     "genre": "xtexts: ./genre/*",
     "actor": "xtexts: ./actor//name",
     "writer": "xtexts: ./author//name",

View File

@@ -0,0 +1,109 @@
+{
+  "type": "tvshow_episode",
+  "site": "douban.com",
+  "steps": [
+    {
+      "http": {
+        "url": "https://www.douban.com/search?cat=1002&q={title}",
+        "method": "GET",
+        "headers": {
+          "Referer": "https://www.douban.com/",
+          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+        },
+        "result": "metadata"
+      }
+    },
+    {
+      "collect": {
+        "source": "metadata",
+        "into": {
+          "ids": "matches: 电视剧].*?sid:\\s*(\\d+)\\s*,"
+        }
+      }
+    },
+    {
+      "loop": {
+        "source": "ids",
+        "item": "id",
+        "steps": [
+          {
+            "http": {
+              "url": "https://movie.douban.com/subject/{id}/",
+              "method": "GET",
+              "headers": {
+                "Host": "movie.douban.com",
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+              },
+              "result": "subject"
+            }
+          },
+          {
+            "collect": {
+              "source": "subject",
+              "from": "xtext: .//script[@type='application/ld+json']",
+              "into": {
+                "episode": {
+                  "title": "xtext: ./name",
+                  "tagline": "",
+                  "original_available": "xtext: ./datePublished",
+                  "certificate": "",
+                  "genre": "xtexts: ./genre/*",
+                  "actor": "xtexts: ./actor//name",
+                  "writer": "xtexts: ./author//name",
+                  "director": "xtexts: ./director//name",
+                  "season": "{$parent[season]}",
+                  "episode": "{$parent[episode]}",
+                  "extra": {
+                    "douban.com": {
+                      "tvshow": {
+                        "title": "xtext: ./name",
+                        "original_available": "xtext: ./datePublished",
+                        "extra": {
+                          "douban.com": {
+                            "poster": "xtexts: ./image",
+                            "backdrop": "xtexts: ./image"
+                          }
+                        }
+                      },
+                      "rating": {
+                        "douban.com": "xtext: .//ratingValue"
+                      },
+                      "poster": "xtexts: ./image"
+                    }
+                  }
+                }
+              }
+            }
+          },
+          {
+            "collect": {
+              "source": "subject",
+              "into": {
+                "episode": {
+                  "summary": "xtext: .//span[@property='v:summary']",
+                  "extra": {
+                    "douban.com": {
+                      "tvshow": {
+                        "summary": "xtext: .//span[@property='v:summary']"
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          },
+          {
+            "replace": {
+              "source": "episode",
+              "pattern": "(.+/photo)/s_ratio_poster/(public/.+)",
+              "replacement": "\\1/m/\\2"
+            }
+          },
+          {
+            "retval": "episode"
+          }
+        ]
+      }
+    }
+  ]
+}
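
A note on the final "replace" step: it rewrites Douban poster URLs from the small s_ratio_poster rendition to the medium "m" rendition before the episode result is returned. A minimal stand-alone sketch of that rewrite, using the pattern and replacement verbatim from the step above (the sample URL is a made-up example of a Douban poster link):

import re

pattern = r"(.+/photo)/s_ratio_poster/(public/.+)"
url = "https://img9.doubanio.com/view/photo/s_ratio_poster/public/p123456.jpg"
print(re.sub(pattern, r"\1/m/\2", url))
# -> https://img9.doubanio.com/view/photo/m/public/p123456.jpg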

View File

@@ -6,7 +6,7 @@ import logging
 import pkgutil
 from abc import ABC, abstractmethod
 from functools import wraps
-from typing import Any, Callable, Optional, Type, Union
+from typing import Any, Callable, Type
 
 _logger = logging.getLogger(__name__)

@@ -22,9 +22,7 @@ class Args(ABC):
         pass
 
     @staticmethod
-    def substitute(
-        obj: Optional[Union[str, list, dict]], context: dict
-    ) -> Optional[Union[str, list, dict]]:
+    def substitute(obj: Any, context: dict) -> Any:
         """Recursively substitute strings in an object with given context."""
         if isinstance(obj, str):
             return obj.format(**context)
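
The hunk above shows only the string branch of substitute; its docstring says the substitution is recursive, so the method presumably walks lists and dicts as well. A stand-alone sketch of that recursive shape (the list and dict branches are an assumption, not shown in the diff):

from typing import Any

def substitute(obj: Any, context: dict) -> Any:
    """Recursively substitute strings in an object with given context."""
    if isinstance(obj, str):
        return obj.format(**context)  # branch shown in the diff
    if isinstance(obj, list):
        return [substitute(v, context) for v in obj]  # assumed branch
    if isinstance(obj, dict):
        return {k: substitute(v, context) for k, v in obj.items()}  # assumed branch
    return obj  # non-string, non-container values pass through unchanged

# substitute({"url": "https://movie.douban.com/subject/{id}/"}, {"id": "123"})
# -> {"url": "https://movie.douban.com/subject/123/"}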

View File

@@ -6,7 +6,7 @@ from typing import Union
 from xml.etree import ElementTree
 
 from scraper.functions import Args, Func
-from scraper.utils import str_to_etree, strip
+from scraper.utils import deep_update, str_to_etree, strip
 
 _logger = logging.getLogger(__name__)

@@ -23,7 +23,7 @@ class CollectArgs(Args):
         from_ = rawargs.get("from")
         if from_ is not None:
             self.source = _render(from_, self.source)
-        self.into = rawargs["into"]
+        self.into = self.substitute(rawargs["into"], context)
         return self

@@ -36,7 +36,7 @@ def collect(args: CollectArgs, context: dict) -> None:
     if isinstance(target, list) and isinstance(result, list):
         target.extend(result)
     elif isinstance(target, dict) and isinstance(result, dict):
-        target.update(result)
+        deep_update(target, result)
     else:
         context[ctxkey] = result
     _logger.info('Collected "%s" using "%s"', ctxkey, tmpl)

@@ -52,6 +52,8 @@ def _render(tmpl: Union[list, dict, str], source: str, etree=None):
     elif isinstance(tmpl, str):
         if len(tmpl.strip()) == 0:
             return ""
+        if ":" not in tmpl:
+            return tmpl
         # split template string into strategy and expression
         strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
         if strategy.startswith("x"):
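
The new early return in _render means a template string without a "strategy:" prefix is now passed through verbatim instead of failing on the split. That matters for literal values in the episode flow such as "{$parent[season]}", which contain no colon. A hypothetical stand-alone reduction of the string-dispatch logic:

def render_str(tmpl: str):
    # Condensed sketch; the real _render dispatches on the strategy prefix.
    if len(tmpl.strip()) == 0:
        return ""
    if ":" not in tmpl:
        return tmpl  # literal value: no "strategy: expression" form
    strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
    return strategy, expr

print(render_str("xtext: ./name"))      # ('xtext', './name')
print(render_str("{$parent[season]}"))  # '{$parent[season]}' passed through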

View File

@@ -18,9 +18,9 @@ _logger = logging.getLogger(__name__)
 
 # define default HTTP cache configuration
-_currentdir = os.path.dirname(os.path.realpath(__file__))
+_basedir = os.path.dirname(os.path.realpath(__file__))
 _cache_name = ".httpcache"
-_cache_file = os.path.join(_currentdir, _cache_name)
+_cache_file = os.path.join(_basedir, _cache_name)
 _cache_expire = 86400
 
 # define a global opener and install it to urllib.request

@@ -41,7 +41,7 @@ class HttpArgs(Args):
     result: str
 
     def parse(self, rawargs: dict, context: dict) -> "HttpArgs":
-        self.url = self.substitute(rawargs["url"], context)  # type: ignore
+        self.url = self.substitute(rawargs["url"], context)
         self.method = rawargs["method"].upper()
         self.headers = {
             k.lower(): self.substitute(v, context)

@@ -84,9 +84,9 @@ def _http_request(url, method, headers, body, timeout):
     # check if the cache is expired
     shelve_flag = "c"  # creating database if not exist
-    for filename in os.listdir(_currentdir):
+    for filename in os.listdir(_basedir):
         if filename.startswith(_cache_name):
-            shelve_file = os.path.join(_currentdir, filename)
+            shelve_file = os.path.join(_basedir, filename)
             modify_time = os.path.getmtime(shelve_file)
             if (time.time() - modify_time) > _cache_expire:
                 shelve_flag = "n"  # always create a new, empty database

View File

@@ -13,8 +13,8 @@ from scraper.functions import findfunc
 _logger = logging.getLogger(__name__)
 
 # define default scraping config file path
-_currentdir = os.path.dirname(os.path.realpath(__file__))
-_configpath = os.path.join(_currentdir, "../scrapeflows")
+_basedir = os.path.dirname(os.path.realpath(__file__))
+_configpath = os.path.join(_basedir, "../scrapeflows")
 
 # define maximum number of results to return
 _maxlimit = 10

@@ -46,9 +46,9 @@ def scrape():
     # parse --input argument as JSON
     jsoninput = json.loads(args.input)
     initialval = {
-        "title": jsoninput.get("title", None),
+        "title": jsoninput["title"],
         "season": jsoninput.get("season", 0),
-        "episode": jsoninput.get("episode", None),
+        "episode": jsoninput.get("episode", 1),
         "year": jsoninput.get("original_available", None),
         "lang": args.lang,
         "limit": maxlimit,

View File

@@ -7,6 +7,16 @@ from xml.etree import ElementTree
 from scraper.exceptions import ResultParseError
 
 
+def deep_update(d: dict, u: dict):
+    """Recursively update a dictionary."""
+    for k, v in u.items():
+        if k in d and isinstance(d[k], dict) and isinstance(v, dict):
+            d[k] = deep_update(d[k], v)
+        else:
+            d[k] = v
+    return d
+
+
 def strip(result: Any):
     """Strip leading and trailing whitespace."""
     if isinstance(result, list):
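
deep_update is what lets the two collect steps of the episode flow write into the same nested "extra" subtree without clobbering each other; a plain dict.update would replace the whole subtree with the second step's version. A small demonstration, with deep_update copied verbatim from the hunk above (the sample dicts are trimmed-down versions of the flow's output):

def deep_update(d: dict, u: dict):
    """Recursively update a dictionary."""
    for k, v in u.items():
        if k in d and isinstance(d[k], dict) and isinstance(v, dict):
            d[k] = deep_update(d[k], v)
        else:
            d[k] = v
    return d

step1 = {"episode": {"title": "...", "extra": {"douban.com": {"tvshow": {"title": "..."}}}}}
step2 = {"episode": {"summary": "...", "extra": {"douban.com": {"tvshow": {"summary": "..."}}}}}

merged = deep_update(step1, step2)
# merged["episode"]["extra"]["douban.com"]["tvshow"] keeps both "title"
# and "summary"; dict.update would have kept only step2's subtree.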