# Files
# syno-videoinfo-plugin/scraper/scraper.py
# 2023-08-16 19:29:03 +08:00
#
# 143 lines
# 4.7 KiB
# Python
"""Entry point for the scraper."""
import argparse
import json
import logging
import threading
import time
from pathlib import Path
from typing import Any, List
from scraper.exceptions import ScrapeError
from scraper.functions import findfunc
# module-level logger named after this module
_logger = logging.getLogger(__name__)
# define default scraping config file path
# (resolved relative to this file so it works regardless of CWD)
_basedir = Path(__file__).resolve().parent
_configpath = _basedir / "../scrapeflows"
# define maximum number of results to return
_maxlimit = 10
# shared accumulator that all scraper worker threads append into
_results: List[Any] = []
def scrape(plugin_id: str) -> str:
    """Scrape video information from given arguments.

    Parses CLI-style arguments (``--input`` is a JSON string that must
    contain at least a ``title``), runs every matching scrape flow in
    its own thread, and returns the collected results serialized as a
    JSON string with every ``[plugin_id]`` placeholder substituted by
    *plugin_id*.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--type", type=str, required=True)
    parser.add_argument("--lang", type=str, required=False)
    parser.add_argument("--limit", type=int, default=_maxlimit)
    parser.add_argument("--allowguess", action="store_true", default=False)
    parser.add_argument("--configpath", type=str, default=str(_configpath))
    parser.add_argument("--loglevel", type=str, default="critical")
    args = parser.parse_known_args()[0]
    # never return more results than the hard upper bound,
    # whatever the caller requests
    maxlimit = min(args.limit, _maxlimit)
    # set basic logging configuration; fall back to CRITICAL (the
    # documented default) instead of raising AttributeError when an
    # unknown level name is passed
    loglevel = getattr(logging, args.loglevel.upper(), logging.CRITICAL)
    logformat = (
        "%(asctime)s %(threadName)s %(levelname)s "
        "%(filename)s:%(lineno)d - %(message)s"
    )
    logging.basicConfig(level=loglevel, format=logformat)
    # parse --input argument as JSON; "title" is mandatory and a missing
    # key raises KeyError by design
    jsoninput = json.loads(args.input)
    initialval = {
        "title": jsoninput["title"],
        "season": jsoninput.get("season", 0),
        "episode": jsoninput.get("episode", 1),
        "year": jsoninput.get("original_available", None),
        "lang": args.lang,
        "limit": maxlimit,
        "allowguess": args.allowguess,
    }
    # load and execute scrape flows using multithreading; each flow
    # appends into the shared module-level _results list
    start = time.time()
    tasks = []
    for flow in ScrapeFlow.load(args.configpath, args.type, initialval):
        task = threading.Thread(target=_start, args=(flow, maxlimit))
        tasks.append(task)
        task.start()
    for task in tasks:
        task.join()
    end = time.time()
    _logger.info("Total execution time: %.3f seconds", end - start)
    return json.dumps(
        {"success": True, "result": _results}, ensure_ascii=False, indent=2
    ).replace("[plugin_id]", plugin_id)
def _start(flow: "ScrapeFlow", limit: int):
    """Run *flow* and append its results to the shared ``_results`` list.

    Stops pulling results once the shared list holds *limit* entries or
    the flow's generator is exhausted.  Scrape failures are logged and
    swallowed so one broken site cannot take down the other workers.
    """
    try:
        generator = flow.start()
        while len(_results) < limit:
            try:
                item = next(generator)
            except StopIteration:
                break
            _results.append(item)
    except ScrapeError:
        _logger.error("Failed to scrape from %s", flow.site, exc_info=True)
class ScrapeFlow:
    """A flow of steps to scrape video information.

    Each step is a single-entry mapping of function name to its raw
    arguments; ``start`` resolves the function via ``findfunc`` and
    executes it against the mutable ``context`` dict.
    """

    def __init__(self, site: str, steps: List[dict], context: dict):
        self.site = site
        self.steps = steps
        self.context = context

    def start(self):
        """Start the scrape flow and return a generator of results.

        Steps are read non-destructively: the previous implementation
        used ``dict.popitem``, which emptied each step dict and made the
        flow single-use; iterating the items leaves ``self.steps``
        intact so a flow can be restarted.
        """
        for step in self.steps:
            funcname, rawargs = next(iter(step.items()))
            # execute the function with context
            iterable = findfunc(funcname)(rawargs, self.context)
            if iterable is not None:
                yield from iterable

    @staticmethod
    def load(path: str, videotype: str, initialval: dict):
        """Load scrape flows from the ``*.json`` definitions under *path*.

        Yields one ScrapeFlow per definition whose type matches
        *videotype* and that is enabled by the optional saved
        ``scrapeflows.conf`` next to this package.
        """
        saved_conf = None
        conf_path = _basedir / "../scrapeflows.conf"
        if conf_path.exists():
            with open(conf_path, "r", encoding="utf-8") as reader:
                saved_conf = json.load(reader)
        for filepath in Path(path).glob("*.json"):
            with open(filepath, "r", encoding="utf-8") as flowdef_json:
                flowdef = json.load(flowdef_json)
            # filter out flows that do not match the video type
            if not ScrapeFlow.valid(flowdef, videotype, saved_conf):
                continue
            # generate a flow instance from the definition; copy the
            # initial values so flows do not share mutable context
            site = flowdef["site"]
            steps = list(flowdef["steps"])
            context = initialval.copy()
            context["site"] = site
            yield ScrapeFlow(site, steps, context)

    @staticmethod
    def valid(flowdef: Any, videotype: str, conf: Any):
        """Check if the flow definition is enabled for *videotype*."""
        if flowdef["type"] != videotype:
            return False
        site = flowdef["site"]
        if conf is not None and site in conf:
            # the saved config restricts a site to certain type prefixes;
            # a missing/empty "types" list disables the site instead of
            # raising KeyError as the previous implementation did
            types = conf[site].get("types", [])
            if not any(videotype.startswith(t) for t in types):
                return False
        return True