Support for TV show episode information scraping

commit 98cb4b54b8 (parent ed168eea92)
Author: C5H12O5
Date:   2023-08-11 01:13:16 +08:00
7 changed files with 136 additions and 17 deletions

View File

@@ -45,8 +45,8 @@
   "movie": {
     "title": "xtext: ./name",
     "tagline": "",
-    "certificate": "",
     "original_available": "xtext: ./datePublished",
+    "certificate": "",
     "genre": "xtexts: ./genre/*",
     "actor": "xtexts: ./actor//name",
     "writer": "xtexts: ./author//name",

View File

@@ -0,0 +1,109 @@
+{
+  "type": "tvshow_episode",
+  "site": "douban.com",
+  "steps": [
+    {
+      "http": {
+        "url": "https://www.douban.com/search?cat=1002&q={title}",
+        "method": "GET",
+        "headers": {
+          "Referer": "https://www.douban.com/",
+          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+        },
+        "result": "metadata"
+      }
+    },
+    {
+      "collect": {
+        "source": "metadata",
+        "into": {
+          "ids": "matches: 电视剧].*?sid:\\s*(\\d+)\\s*,"
+        }
+      }
+    },
+    {
+      "loop": {
+        "source": "ids",
+        "item": "id",
+        "steps": [
+          {
+            "http": {
+              "url": "https://movie.douban.com/subject/{id}/",
+              "method": "GET",
+              "headers": {
+                "Host": "movie.douban.com",
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+              },
+              "result": "subject"
+            }
+          },
+          {
+            "collect": {
+              "source": "subject",
+              "from": "xtext: .//script[@type='application/ld+json']",
+              "into": {
+                "episode": {
+                  "title": "xtext: ./name",
+                  "tagline": "",
+                  "original_available": "xtext: ./datePublished",
+                  "certificate": "",
+                  "genre": "xtexts: ./genre/*",
+                  "actor": "xtexts: ./actor//name",
+                  "writer": "xtexts: ./author//name",
+                  "director": "xtexts: ./director//name",
+                  "season": "{$parent[season]}",
+                  "episode": "{$parent[episode]}",
+                  "extra": {
+                    "douban.com": {
+                      "tvshow": {
+                        "title": "xtext: ./name",
+                        "original_available": "xtext: ./datePublished",
+                        "extra": {
+                          "douban.com": {
+                            "poster": "xtexts: ./image",
+                            "backdrop": "xtexts: ./image"
+                          }
+                        }
+                      },
+                      "rating": {
+                        "douban.com": "xtext: .//ratingValue"
+                      },
+                      "poster": "xtexts: ./image"
+                    }
+                  }
+                }
+              }
+            }
+          },
+          {
+            "collect": {
+              "source": "subject",
+              "into": {
+                "episode": {
+                  "summary": "xtext: .//span[@property='v:summary']",
+                  "extra": {
+                    "douban.com": {
+                      "tvshow": {
+                        "summary": "xtext: .//span[@property='v:summary']"
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          },
+          {
+            "replace": {
+              "source": "episode",
+              "pattern": "(.+/photo)/s_ratio_poster/(public/.+)",
+              "replacement": "\\1/m/\\2"
+            }
+          },
+          {
+            "retval": "episode"
+          }
+        ]
+      }
+    }
+  ]
+}
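
A note on the final "replace" step: it rewrites Douban poster URLs from the small s_ratio_poster rendition to the medium "m" rendition before the episode result is returned. A minimal stand-alone sketch of that rewrite, using the pattern and replacement verbatim from the step above (the sample URL is a made-up example of a Douban poster link):

import re

pattern = r"(.+/photo)/s_ratio_poster/(public/.+)"
url = "https://img9.doubanio.com/view/photo/s_ratio_poster/public/p123456.jpg"
print(re.sub(pattern, r"\1/m/\2", url))
# -> https://img9.doubanio.com/view/photo/m/public/p123456.jpg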

View File

@@ -6,7 +6,7 @@ import logging
 import pkgutil
 from abc import ABC, abstractmethod
 from functools import wraps
-from typing import Any, Callable, Optional, Type, Union
+from typing import Any, Callable, Type
 
 _logger = logging.getLogger(__name__)

@@ -22,9 +22,7 @@ class Args(ABC):
         pass
 
     @staticmethod
-    def substitute(
-        obj: Optional[Union[str, list, dict]], context: dict
-    ) -> Optional[Union[str, list, dict]]:
+    def substitute(obj: Any, context: dict) -> Any:
         """Recursively substitute strings in an object with given context."""
         if isinstance(obj, str):
             return obj.format(**context)
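
The hunk above shows only the string branch of substitute; its docstring says the substitution is recursive, so the method presumably walks lists and dicts as well. A stand-alone sketch of that recursive shape (the list and dict branches are an assumption, not shown in the diff):

from typing import Any

def substitute(obj: Any, context: dict) -> Any:
    """Recursively substitute strings in an object with given context."""
    if isinstance(obj, str):
        return obj.format(**context)  # branch shown in the diff
    if isinstance(obj, list):
        return [substitute(v, context) for v in obj]  # assumed branch
    if isinstance(obj, dict):
        return {k: substitute(v, context) for k, v in obj.items()}  # assumed branch
    return obj  # non-string, non-container values pass through unchanged

# substitute({"url": "https://movie.douban.com/subject/{id}/"}, {"id": "123"})
# -> {"url": "https://movie.douban.com/subject/123/"}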

View File

@@ -6,7 +6,7 @@ from typing import Union
 from xml.etree import ElementTree
 
 from scraper.functions import Args, Func
-from scraper.utils import str_to_etree, strip
+from scraper.utils import deep_update, str_to_etree, strip
 
 _logger = logging.getLogger(__name__)

@@ -23,7 +23,7 @@ class CollectArgs(Args):
         from_ = rawargs.get("from")
         if from_ is not None:
             self.source = _render(from_, self.source)
-        self.into = rawargs["into"]
+        self.into = self.substitute(rawargs["into"], context)
         return self

@@ -36,7 +36,7 @@ def collect(args: CollectArgs, context: dict) -> None:
     if isinstance(target, list) and isinstance(result, list):
         target.extend(result)
     elif isinstance(target, dict) and isinstance(result, dict):
-        target.update(result)
+        deep_update(target, result)
     else:
         context[ctxkey] = result
     _logger.info('Collected "%s" using "%s"', ctxkey, tmpl)

@@ -52,6 +52,8 @@ def _render(tmpl: Union[list, dict, str], source: str, etree=None):
     elif isinstance(tmpl, str):
         if len(tmpl.strip()) == 0:
             return ""
+        if ":" not in tmpl:
+            return tmpl
         # split template string into strategy and expression
         strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
         if strategy.startswith("x"):
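
The new early return in _render means a template string without a "strategy:" prefix is now passed through verbatim instead of failing on the split. That matters for literal values in the episode flow such as "{$parent[season]}", which contain no colon. A hypothetical stand-alone reduction of the string-dispatch logic:

def render_str(tmpl: str):
    # Condensed sketch; the real _render dispatches on the strategy prefix.
    if len(tmpl.strip()) == 0:
        return ""
    if ":" not in tmpl:
        return tmpl  # literal value: no "strategy: expression" form
    strategy, expr = [s.strip() for s in tmpl.split(":", 1)]
    return strategy, expr

print(render_str("xtext: ./name"))      # ('xtext', './name')
print(render_str("{$parent[season]}"))  # '{$parent[season]}' passed through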

View File

@@ -18,9 +18,9 @@ _logger = logging.getLogger(__name__)
 
 # define default HTTP cache configuration
-_currentdir = os.path.dirname(os.path.realpath(__file__))
+_basedir = os.path.dirname(os.path.realpath(__file__))
 _cache_name = ".httpcache"
-_cache_file = os.path.join(_currentdir, _cache_name)
+_cache_file = os.path.join(_basedir, _cache_name)
 _cache_expire = 86400
 
 # define a global opener and install it to urllib.request

@@ -41,7 +41,7 @@ class HttpArgs(Args):
     result: str
 
     def parse(self, rawargs: dict, context: dict) -> "HttpArgs":
-        self.url = self.substitute(rawargs["url"], context)  # type: ignore
+        self.url = self.substitute(rawargs["url"], context)
         self.method = rawargs["method"].upper()
         self.headers = {
             k.lower(): self.substitute(v, context)

@@ -84,9 +84,9 @@ def _http_request(url, method, headers, body, timeout):
     # check if the cache is expired
     shelve_flag = "c"  # creating database if not exist
-    for filename in os.listdir(_currentdir):
+    for filename in os.listdir(_basedir):
         if filename.startswith(_cache_name):
-            shelve_file = os.path.join(_currentdir, filename)
+            shelve_file = os.path.join(_basedir, filename)
             modify_time = os.path.getmtime(shelve_file)
             if (time.time() - modify_time) > _cache_expire:
                 shelve_flag = "n"  # always create a new, empty database

View File

@@ -13,8 +13,8 @@ from scraper.functions import findfunc
 _logger = logging.getLogger(__name__)
 
 # define default scraping config file path
-_currentdir = os.path.dirname(os.path.realpath(__file__))
-_configpath = os.path.join(_currentdir, "../scrapeflows")
+_basedir = os.path.dirname(os.path.realpath(__file__))
+_configpath = os.path.join(_basedir, "../scrapeflows")
 
 # define maximum number of results to return
 _maxlimit = 10

@@ -46,9 +46,9 @@ def scrape():
     # parse --input argument as JSON
     jsoninput = json.loads(args.input)
     initialval = {
-        "title": jsoninput.get("title", None),
+        "title": jsoninput["title"],
         "season": jsoninput.get("season", 0),
-        "episode": jsoninput.get("episode", None),
+        "episode": jsoninput.get("episode", 1),
         "year": jsoninput.get("original_available", None),
         "lang": args.lang,
         "limit": maxlimit,

View File

@@ -7,6 +7,16 @@ from xml.etree import ElementTree
 from scraper.exceptions import ResultParseError
 
 
+def deep_update(d: dict, u: dict):
+    """Recursively update a dictionary."""
+    for k, v in u.items():
+        if k in d and isinstance(d[k], dict) and isinstance(v, dict):
+            d[k] = deep_update(d[k], v)
+        else:
+            d[k] = v
+    return d
+
+
 def strip(result: Any):
     """Strip leading and trailing whitespace."""
     if isinstance(result, list):
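
deep_update is what lets the two collect steps of the episode flow write into the same nested "extra" subtree without clobbering each other; a plain dict.update would replace the whole subtree with the second step's version. A small demonstration, with deep_update copied verbatim from the hunk above (the sample dicts are trimmed-down versions of the flow's output):

def deep_update(d: dict, u: dict):
    """Recursively update a dictionary."""
    for k, v in u.items():
        if k in d and isinstance(d[k], dict) and isinstance(v, dict):
            d[k] = deep_update(d[k], v)
        else:
            d[k] = v
    return d

step1 = {"episode": {"title": "...", "extra": {"douban.com": {"tvshow": {"title": "..."}}}}}
step2 = {"episode": {"summary": "...", "extra": {"douban.com": {"tvshow": {"summary": "..."}}}}}

merged = deep_update(step1, step2)
# merged["episode"]["extra"]["douban.com"]["tvshow"] keeps both "title"
# and "summary"; dict.update would have kept only step2's subtree.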