mirror of https://github.com/C5H12O5/syno-videoinfo-plugin.git (synced 2025-08-16 17:12:14 +00:00)
Support for TV show episode information scraping
@@ -45,8 +45,8 @@
 "movie": {
     "title": "xtext: ./name",
     "tagline": "",
-    "certificate": "",
     "original_available": "xtext: ./datePublished",
+    "certificate": "",
     "genre": "xtexts: ./genre/*",
     "actor": "xtexts: ./actor//name",
     "writer": "xtexts: ./author//name",
scrapeflows/douban_tvshow_episode.json (new file, 109 lines)
@@ -0,0 +1,109 @@
{
  "type": "tvshow_episode",
  "site": "douban.com",
  "steps": [
    {
      "http": {
        "url": "https://www.douban.com/search?cat=1002&q={title}",
        "method": "GET",
        "headers": {
          "Referer": "https://www.douban.com/",
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        },
        "result": "metadata"
      }
    },
    {
      "collect": {
        "source": "metadata",
        "into": {
          "ids": "matches: 电视剧].*?sid:\\s*(\\d+)\\s*,"
        }
      }
    },
    {
      "loop": {
        "source": "ids",
        "item": "id",
        "steps": [
          {
            "http": {
              "url": "https://movie.douban.com/subject/{id}/",
              "method": "GET",
              "headers": {
                "Host": "movie.douban.com",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
              },
              "result": "subject"
            }
          },
          {
            "collect": {
              "source": "subject",
              "from": "xtext: .//script[@type='application/ld+json']",
              "into": {
                "episode": {
                  "title": "xtext: ./name",
                  "tagline": "",
                  "original_available": "xtext: ./datePublished",
                  "certificate": "",
                  "genre": "xtexts: ./genre/*",
                  "actor": "xtexts: ./actor//name",
                  "writer": "xtexts: ./author//name",
                  "director": "xtexts: ./director//name",
                  "season": "{$parent[season]}",
                  "episode": "{$parent[episode]}",
                  "extra": {
                    "douban.com": {
                      "tvshow": {
                        "title": "xtext: ./name",
                        "original_available": "xtext: ./datePublished",
                        "extra": {
                          "douban.com": {
                            "poster": "xtexts: ./image",
                            "backdrop": "xtexts: ./image"
                          }
                        }
                      },
                      "rating": {
                        "douban.com": "xtext: .//ratingValue"
                      },
                      "poster": "xtexts: ./image"
                    }
                  }
                }
              }
            }
          },
          {
            "collect": {
              "source": "subject",
              "into": {
                "episode": {
                  "summary": "xtext: .//span[@property='v:summary']",
                  "extra": {
                    "douban.com": {
                      "tvshow": {
                        "summary": "xtext: .//span[@property='v:summary']"
                      }
                    }
                  }
                }
              }
            }
          },
          {
            "replace": {
              "source": "episode",
              "pattern": "(.+/photo)/s_ratio_poster/(public/.+)",
              "replacement": "\\1/m/\\2"
            }
          },
          {
            "retval": "episode"
          }
        ]
      }
    }
  ]
}
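For orientation, a minimal standalone sketch (a hypothetical helper, not the plugin's step runner) of what the first two steps of this flow amount to: fetch the Douban search page for the title and pull the TV-series subject ids out of it with the same regular expression used in the "ids" template; each id then drives the per-subject requests inside the "loop" step.

import re
import urllib.parse
import urllib.request

def search_tvshow_ids(title):
    """Fetch the Douban search page and extract TV-series subject ids."""
    url = "https://www.douban.com/search?cat=1002&q=" + urllib.parse.quote(title)
    request = urllib.request.Request(url, headers={
        "Referer": "https://www.douban.com/",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    })
    with urllib.request.urlopen(request, timeout=10) as response:
        html = response.read().decode("utf-8", errors="ignore")
    # same pattern as the "ids" entry above (JSON escaping removed)
    return re.findall(r"电视剧].*?sid:\s*(\d+)\s*,", html)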
@@ -6,7 +6,7 @@ import logging
 import pkgutil
 from abc import ABC, abstractmethod
 from functools import wraps
-from typing import Any, Callable, Optional, Type, Union
+from typing import Any, Callable, Type

 _logger = logging.getLogger(__name__)

@@ -22,9 +22,7 @@ class Args(ABC):
         pass

     @staticmethod
-    def substitute(
-        obj: Optional[Union[str, list, dict]], context: dict
-    ) -> Optional[Union[str, list, dict]]:
+    def substitute(obj: Any, context: dict) -> Any:
         """Recursively substitute strings in an object with given context."""
         if isinstance(obj, str):
             return obj.format(**context)
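The hunk above shows only the string branch of the simplified substitute. A plausible sketch of the full recursive shape implied by the docstring (the list and dict branches are assumptions, not part of this diff):

from typing import Any

def substitute(obj: Any, context: dict) -> Any:
    """Recursively substitute strings in an object with given context."""
    if isinstance(obj, str):
        return obj.format(**context)
    if isinstance(obj, list):
        return [substitute(item, context) for item in obj]  # assumed branch
    if isinstance(obj, dict):
        return {k: substitute(v, context) for k, v in obj.items()}  # assumed branch
    return obj

# e.g. substitute({"url": "https://movie.douban.com/subject/{id}/"}, {"id": "1234"})
# -> {"url": "https://movie.douban.com/subject/1234/"}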
@@ -6,7 +6,7 @@ from typing import Union
 from xml.etree import ElementTree

 from scraper.functions import Args, Func
-from scraper.utils import str_to_etree, strip
+from scraper.utils import deep_update, str_to_etree, strip

 _logger = logging.getLogger(__name__)

@@ -23,7 +23,7 @@ class CollectArgs(Args):
         from_ = rawargs.get("from")
         if from_ is not None:
             self.source = _render(from_, self.source)
-        self.into = rawargs["into"]
+        self.into = self.substitute(rawargs["into"], context)
         return self


@@ -36,7 +36,7 @@ def collect(args: CollectArgs, context: dict) -> None:
     if isinstance(target, list) and isinstance(result, list):
         target.extend(result)
     elif isinstance(target, dict) and isinstance(result, dict):
-        target.update(result)
+        deep_update(target, result)
     else:
         context[ctxkey] = result
     _logger.info('Collected "%s" using "%s"', ctxkey, tmpl)
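To see why the shallow dict.update is replaced at this call site (values below are made up): both "collect" steps in the episode flow write into episode["extra"]["douban.com"], and a shallow update would throw away whatever the earlier step had already gathered.

from scraper.utils import deep_update  # helper added in this commit

first = {"title": "Example", "extra": {"douban.com": {"poster": ["p1.jpg"]}}}
second = {"summary": "...", "extra": {"douban.com": {"tvshow": {"summary": "..."}}}}

shallow = dict(first)
shallow.update(second)      # "poster" under extra/douban.com is lost
deep_update(first, second)  # "poster" and the nested tvshow summary both survive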
@@ -52,6 +52,8 @@ def _render(tmpl: Union[list, dict, str], source: str, etree=None):
     elif isinstance(tmpl, str):
+        if len(tmpl.strip()) == 0:
+            return ""
         if ":" not in tmpl:
             return tmpl
         # split template string into strategy and expression
         strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
         if strategy.startswith("x"):
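A compact illustration of the template handling around the added guard (hypothetical values):

tmpl = "xtexts: ./actor//name"
strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
# strategy == "xtexts", expr == "./actor//name"
# an empty template, such as the "tagline" fields in the flow above, now returns ""
# immediately instead of falling through to the strategy/expression parsing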
@@ -18,9 +18,9 @@ _logger = logging.getLogger(__name__)


 # define default HTTP cache configuration
-_currentdir = os.path.dirname(os.path.realpath(__file__))
+_basedir = os.path.dirname(os.path.realpath(__file__))
 _cache_name = ".httpcache"
-_cache_file = os.path.join(_currentdir, _cache_name)
+_cache_file = os.path.join(_basedir, _cache_name)
 _cache_expire = 86400

 # define a global opener and install it to urllib.request
@@ -41,7 +41,7 @@ class HttpArgs(Args):
     result: str

     def parse(self, rawargs: dict, context: dict) -> "HttpArgs":
-        self.url = self.substitute(rawargs["url"], context)  # type: ignore
+        self.url = self.substitute(rawargs["url"], context)
         self.method = rawargs["method"].upper()
         self.headers = {
             k.lower(): self.substitute(v, context)
@@ -84,9 +84,9 @@ def _http_request(url, method, headers, body, timeout):

     # check if the cache is expired
     shelve_flag = "c"  # creating database if not exist
-    for filename in os.listdir(_currentdir):
+    for filename in os.listdir(_basedir):
         if filename.startswith(_cache_name):
-            shelve_file = os.path.join(_currentdir, filename)
+            shelve_file = os.path.join(_basedir, filename)
             modify_time = os.path.getmtime(shelve_file)
             if (time.time() - modify_time) > _cache_expire:
                 shelve_flag = "n"  # always create a new, empty database
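The non-obvious part of this hunk is the shelve flag. A small self-contained sketch (hypothetical path and key) of what "c" versus "n" means here:

import shelve

# "c" opens the shelf read/write and creates it if missing; "n" always creates a
# new, empty database, which is how a cache older than _cache_expire seconds
# (one day) gets discarded.
shelve_flag = "c"  # switched to "n" when the existing cache file is too old
with shelve.open("/tmp/.httpcache", flag=shelve_flag) as cache:  # hypothetical path
    cache["https://example.com"] = b"cached response"  # hypothetical key/value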
@@ -13,8 +13,8 @@ from scraper.functions import findfunc
 _logger = logging.getLogger(__name__)

 # define default scraping config file path
-_currentdir = os.path.dirname(os.path.realpath(__file__))
-_configpath = os.path.join(_currentdir, "../scrapeflows")
+_basedir = os.path.dirname(os.path.realpath(__file__))
+_configpath = os.path.join(_basedir, "../scrapeflows")

 # define maximum number of results to return
 _maxlimit = 10
@@ -46,9 +46,9 @@ def scrape():
     # parse --input argument as JSON
     jsoninput = json.loads(args.input)
     initialval = {
-        "title": jsoninput.get("title", None),
+        "title": jsoninput["title"],
         "season": jsoninput.get("season", 0),
-        "episode": jsoninput.get("episode", None),
+        "episode": jsoninput.get("episode", 1),
         "year": jsoninput.get("original_available", None),
         "lang": args.lang,
         "limit": maxlimit,
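A hypothetical --input payload for an episode lookup, showing the effect of the new defaults (title becomes required, episode falls back to 1 rather than None):

import json

raw = '{"title": "Some Show", "season": 2, "original_available": "2023-05-01"}'
jsoninput = json.loads(raw)

initialval = {
    "title": jsoninput["title"],             # KeyError if the caller omits it
    "season": jsoninput.get("season", 0),
    "episode": jsoninput.get("episode", 1),  # payload omits it, so this is 1
    "year": jsoninput.get("original_available", None),
}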
@@ -7,6 +7,16 @@ from xml.etree import ElementTree
 from scraper.exceptions import ResultParseError


+def deep_update(d: dict, u: dict):
+    """Recursively update a dictionary."""
+    for k, v in u.items():
+        if k in d and isinstance(d[k], dict) and isinstance(v, dict):
+            d[k] = deep_update(d[k], v)
+        else:
+            d[k] = v
+    return d
+
+
 def strip(result: Any):
     """Strip leading and trailing whitespace."""
     if isinstance(result, list):
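Minimal usage of the new helper (made-up data): nested dictionaries are merged key by key rather than replaced wholesale.

base = {"a": {"x": 1}, "b": 0}
deep_update(base, {"a": {"y": 2}, "b": 3})
assert base == {"a": {"x": 1, "y": 2}, "b": 3}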