docker-mods/root/drivethrurpg.py
Ross Hendry dd67799eb1 calibre-web: dtrpg-metadata Add DMs Guild and fix cover downloads
DriveThruRPG contains multiple stores, one of which is the DMs Guild
for Dungeons and Dragons. Items that are only for sale on the Guild
don't show up on DTRPG searches, and vice versa.

A lot of the images end up being redirected to an eventual file, but
Calibre-web doesn't follow redirects when saving the cover. Instead, we
follow them up front and return the eventual URL.
2022-11-13 14:35:31 +00:00

222 lines
7.6 KiB
Python

# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from typing import Dict, List, Optional, cast
from urllib.parse import quote
from lxml import html
import requests
import re
from cps import logger
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
# Module-level logger created through the `cps.logger` helper.
log = logger.create()
# HTTP headers sent with the search and product-page GET requests made by the
# helper functions in this module.
HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"}
class DMSGuild(Metadata):
    """Metadata provider that searches the DMs Guild storefront.

    The store is queried through its autocomplete endpoint (``BASE_URL``);
    the module-level helper ``_do_dtrpg_search`` performs the request and
    builds the resulting records.
    """

    __name__ = "DMSGuild"
    __id__ = "dmsguild"
    DESCRIPTION = "DM's Guild"
    META_URL = "https://www.dmsguild.com"
    BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term="
    QUERY_PARAMS = "&json=true"
    # Mirrors the module-level HEADERS. The search helper reads the
    # module-level constant, so this copy is not used by the code visible in
    # this module; it is kept so the class attribute remains available.
    HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"}

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search the DMs Guild for titles matching ``query``.

        Args:
            query: Free-text title to look for.
            generic_cover: Not used by this provider.
            locale: Not used by this provider.

        Returns:
            ``None`` when ``self.active`` is falsy, otherwise the list of
            records produced by ``_do_dtrpg_search`` (possibly empty).
        """
        if not self.active:
            return None

        title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
        if title_tokens:
            encoded_query = "%20".join(
                quote(token.encode("utf-8")) for token in title_tokens
            )
        else:
            # No usable tokens: still percent-encode the raw text so that
            # characters such as "&", "#" or spaces cannot alter or break
            # the query string of the request URL.
            encoded_query = quote(query.encode("utf-8"))

        return _do_dtrpg_search(
            query=f"{self.BASE_URL}{encoded_query}{self.QUERY_PARAMS}",
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )
class DriveThruRpg(Metadata):
    """Metadata provider that searches the DriveThruRPG storefront.

    The store is queried through its autocomplete endpoint (``BASE_URL``);
    the module-level helper ``_do_dtrpg_search`` performs the request and
    builds the resulting records.
    """

    __name__ = "DriveThruRPG"
    __id__ = "drivethrurpg"
    DESCRIPTION = "DriveThru RPG"
    META_URL = "https://www.drivethrurpg.com"
    BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term="
    QUERY_PARAMS = "&json=true"

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search DriveThruRPG for titles matching ``query``.

        Args:
            query: Free-text title to look for.
            generic_cover: Not used by this provider.
            locale: Not used by this provider.

        Returns:
            ``None`` when ``self.active`` is falsy, otherwise the list of
            records produced by ``_do_dtrpg_search`` (possibly empty).
        """
        if not self.active:
            return None

        title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
        if title_tokens:
            encoded_query = "%20".join(
                quote(token.encode("utf-8")) for token in title_tokens
            )
        else:
            # No usable tokens: still percent-encode the raw text so that
            # characters such as "&", "#" or spaces cannot alter or break
            # the query string of the request URL.
            encoded_query = quote(query.encode("utf-8"))

        return _do_dtrpg_search(
            query=f"{self.BASE_URL}{encoded_query}{self.QUERY_PARAMS}",
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )
def _do_dtrpg_search(
    query: str, source: MetaSourceInfo, timeout: float = 10.0
) -> List[MetaRecord]:
    """Run an autocomplete search and return detailed records for the top hits.

    Args:
        query: Complete autocomplete URL, including the encoded search term.
        source: Source descriptor attached to every record; its ``id`` is
            also used as the key of each record's ``identifiers`` mapping.
        timeout: Seconds to wait for the autocomplete request made by this
            function before giving up.

    Returns:
        At most five records, one per usable search hit. An empty list is
        returned when the request fails, the body is not a JSON list, or the
        site reports no hits.
    """
    try:
        log.info(f"Requesting data from: {query}")
        response = requests.get(query, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        # Decode inside the try block: a body that is not valid JSON raises
        # too, and should be treated like any other failed lookup.
        results_list = response.json()
    except Exception as e:
        # Best effort: any failure is logged and yields no results instead
        # of an exception.
        log.warning(e)
        return []

    if not isinstance(results_list, list):
        log.warning(
            f"Unexpected search response of type {type(results_list).__name__}"
        )
        return []

    # If there are no hits we see a single element being returned with the
    # easiest identifier being the link.
    if len(results_list) == 1:
        only_hit = results_list[0]
        if isinstance(only_hit, dict) and only_hit.get("link") == "#":
            log.info("No results found")
            return []

    # Since we'll go on to do N further requests for more information,
    # we'll cut it off at the first five results here. Any sufficiently well
    # populated search by title should be enough
    results: List[MetaRecord] = []
    for hit in results_list[:5]:
        if not isinstance(hit, dict):
            # Skip malformed entries explicitly; an assert would vanish
            # under ``python -O`` and would otherwise abort the search.
            log.warning(f"Skipping unexpected search result: {hit!r}")
            continue
        match = _fetch_dtrpg_search_result(result=hit, source=source)
        match.identifiers = {source.id: match.id}
        results.append(match)
    return results
def _fetch_dtrpg_search_result(
    result: Dict, source: MetaSourceInfo, timeout: float = 10.0
) -> MetaRecord:
    """Build a record for one search hit and enrich it from its product page.

    Args:
        result: One entry of the autocomplete response; the ``"name"`` and
            ``"link"`` keys are read.
        source: Source descriptor stored on the record.
        timeout: Seconds to wait for the product-page request made by this
            function before giving up.

    Returns:
        A record whose id and title are the hit's name. When the product
        page can be fetched and parsed, the record is enriched by
        ``_parse_dtrpg_result``; on any failure the record is returned with
        whatever had been filled in up to that point.
    """
    name = result.get("name", "")
    link = result.get("link", "")
    match = MetaRecord(
        id=name,
        title=name,
        authors=[],
        url=link,
        source=source,
    )
    if not link:
        # Nothing to fetch; the basic record is all we can offer.
        log.warning(f"Search result {name!r} has no link")
        return match

    try:
        details_result = requests.get(link, headers=HEADERS, timeout=timeout)
        details_result.raise_for_status()
        # Parsing stays inside the try block so that one malformed product
        # page cannot abort the whole search.
        _parse_dtrpg_result(details_result.content, match)
    except Exception as e:
        log.warning(e)
    return match
def _parse_dtrpg_result(
    content: bytes, match: MetaRecord, timeout: float = 10.0
) -> MetaRecord:
    """Fill ``match`` with details scraped from a product page.

    The description, canonical URL, product id, cover, authors, tags and
    publisher are set on ``match`` when the corresponding elements are
    found in the page.

    Args:
        content: Raw HTML of the product page.
        match: Record to update in place.
        timeout: Seconds to wait for the HEAD request that resolves the
            cover URL.

    Returns:
        The same ``match`` object that was passed in.

    Raises:
        Whatever ``lxml.html.fromstring`` raises for content it cannot
        parse.
    """
    AUTHORS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Author(s)')]"
    RULE_SYSTEMS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Rule System(s)')]"
    PUBLISHER_XPATH = "//div[@class='widget-information-wrapper-2']//div[@class='widget-information-title' and contains(text(), 'Publisher')]"
    URL_PROP_XPATH = "//meta[@itemprop='url']/@content"
    DESCRIPTION_XPATH = "//div[contains(@class,'prod-content')]//text()"
    IMAGE_PROP_XPATH = "//meta[@itemprop='image']/@content"

    data = html.fromstring(content)

    # Use the big text field as description as the meta tag is very short
    description_parts = cast(List[str], data.xpath(DESCRIPTION_XPATH))
    match.description = "".join(description_parts).strip()

    product_url = cast(List[str], data.xpath(URL_PROP_XPATH))
    if product_url:
        match.url = cast(str, product_url[0])
        # We can get a better ID from the URL
        regex = r".*\/product\/(\d+)\/.*"
        product_ids = re.findall(regex, match.url)
        if product_ids:
            match.id = product_ids[0]

    image_url = cast(List[str], data.xpath(IMAGE_PROP_XPATH))
    if image_url:
        cover_url = cast(str, image_url[0])
        # Calibre web doesn't follow redirects and reports some covers as an
        # error, so resolve the final URL up front.
        log.info(f"Cover URL is {cover_url}")
        try:
            r = requests.head(cover_url, allow_redirects=True, timeout=timeout)
            log.info(f"After following redirects, it is {r.url}")
            match.cover = cast(str, r.url)
        except Exception as e:
            # A failed lookup must not discard the rest of the metadata;
            # fall back to the URL advertised by the page.
            log.warning(e)
            match.cover = cover_url

    # Find authors
    author_pattern = re.compile(r"^\w[\w\s]+$")
    for div in cast(List, data.xpath(AUTHORS_XPATH)):
        value_node = div.getnext()
        if value_node is None:
            # Title element without a following sibling: nothing to read.
            continue
        # Just bring in elements that look like they might be authors.
        match.authors = [
            text.strip()
            for text in value_node.xpath(".//text()")
            if author_pattern.match(text)
        ]

    # Use rule systems as tags
    match.tags = ["RPG"]
    for div in cast(List, data.xpath(RULE_SYSTEMS_XPATH)):
        value_node = div.getnext()
        if value_node is None:
            continue
        # Store the stripped text so tags carry no surrounding whitespace.
        match.tags.extend(
            text.strip()
            for text in value_node.xpath(".//text()")
            if text.strip()
        )

    for div in cast(List, data.xpath(PUBLISHER_XPATH)):
        value_node = div.getnext()
        if value_node is None:
            continue
        # Sometimes we get a link, other times it's text in a different
        # element.
        candidates = value_node.xpath(".//a") or value_node.xpath(
            ".//div[@class='widget-information-item-title']"
        )
        if candidates:
            match.publisher = candidates[0].text_content().strip()

    return match