From dd67799eb1c6e212f887f9ef819593549bb4dc17 Mon Sep 17 00:00:00 2001 From: Ross Hendry Date: Sat, 12 Nov 2022 16:07:56 +0000 Subject: [PATCH] calibre-web: dtrpg-metadata Add DMs Guild and fix cover downloads DriveThruRPG contains multiple stores, one of which is the DMs Guild for Dungeons and Dragons. Items that are only for sale on the Guild don't show up on DTRPG searches, and vice versa. Many cover image URLs redirect one or more times before reaching the actual image file, but Calibre-web doesn't follow redirects when saving the cover. Instead, we resolve the redirects up front and return the final URL. --- root/drivethrurpg.py | 283 ++++++++++++++++++++++++++----------------- 1 file changed, 173 insertions(+), 110 deletions(-) diff --git a/root/drivethrurpg.py b/root/drivethrurpg.py index 4bb7cea..ae25190 100644 --- a/root/drivethrurpg.py +++ b/root/drivethrurpg.py @@ -13,7 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from typing import Dict, List, Optional +from typing import Dict, List, Optional, cast from urllib.parse import quote from lxml import html import requests @@ -24,16 +24,132 @@ from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata log = logger.create() +HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + + +class DMSGuild(Metadata): + __name__ = "DMSGuild" + __id__ = "dmsguild" + DESCRIPTION = "DM's Guild" + META_URL = "https://www.dmsguild.com" + BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term=" + QUERY_PARAMS = "&json=true" + HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if not self.active: + return None + + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + + matches = _do_dtrpg_search( + 
query=f"{self.BASE_URL}{query}{self.QUERY_PARAMS}", + source=MetaSourceInfo( + id=self.__id__, + description=self.DESCRIPTION, + link=self.META_URL, + ), + ) + + return matches + class DriveThruRpg(Metadata): __name__ = "DriveThruRPG" __id__ = "drivethrurpg" DESCRIPTION = "DriveThru RPG" - META_URL = "https://www.drivethrurpg.com/" - BASE_URL = "https://www.drivethrurpg.com/includes/ajax/search_autocomplete_jquery.php?term=" + META_URL = "https://www.drivethrurpg.com" + BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term=" QUERY_PARAMS = "&json=true" - HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: + if not self.active: + return None + + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + + matches = _do_dtrpg_search( + query=f"{self.BASE_URL}{query}{self.QUERY_PARAMS}", + source=MetaSourceInfo( + id=self.__id__, + description=self.DESCRIPTION, + link=self.META_URL, + ), + ) + + return matches + + +def _do_dtrpg_search(query: str, source: MetaSourceInfo) -> List[MetaRecord]: + try: + log.info(f"Requesting data from: {query}") + result = requests.get( + query, + headers=HEADERS, + ) + result.raise_for_status() + except Exception as e: + log.warning(e) + return list() + + # If there are no hits we see a single element being returned with the easiest + # identifier being the link. + results_list: list = result.json() + if len(results_list) == 1 and results_list[0]["link"] == "#": + log.info("No results found") + return list() + + # Since we'll go on to do N further requests for more information, + # we'll cut it off at the first five results here. 
Any sufficiently well + # populated search by title should be enough + results: List[MetaRecord] = list() + for r in results_list[0:5]: + assert isinstance(r, dict) + match = _fetch_dtrpg_search_result(result=r, source=source) + + identifiers = {} + identifiers[source.id] = match.id + + match.identifiers = identifiers + + results.append(match) + + return results + + +def _fetch_dtrpg_search_result(result: Dict, source: MetaSourceInfo) -> MetaRecord: + match = MetaRecord( + id=result["name"], + title=result["name"], + authors=[], + url=result.get("link", ""), + source=source, + ) + + try: + details_result = requests.get( + result["link"], + headers=HEADERS, + ) + details_result.raise_for_status() + except Exception as e: + log.warning(e) + return match + + _parse_dtrpg_result(details_result.content, match) + + return match + + +def _parse_dtrpg_result(content: bytes, match: MetaRecord): AUTHORS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Author(s)')]" RULE_SYSTEMS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Rule System(s)')]" PUBLISHER_XPATH = "//div[@class='widget-information-wrapper-2']//div[@class='widget-information-title' and contains(text(), 'Publisher')]" @@ -41,118 +157,65 @@ class DriveThruRpg(Metadata): DESCRIPTION_XPATH = "//div[contains(@class,'prod-content')]//text()" IMAGE_PROP_XPATH = "//meta[@itemprop='image']/@content" - def search( - self, query: str, generic_cover: str = "", locale: str = "en" - ) -> Optional[List[MetaRecord]]: - val = list() - if self.active: - title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) - if title_tokens: - tokens = [quote(t.encode("utf-8")) for t in title_tokens] - query = "%20".join(tokens) + data = html.fromstring(content) - try: - result = requests.get( - f"{DriveThruRpg.BASE_URL}{query}{DriveThruRpg.QUERY_PARAMS}", - headers=DriveThruRpg.HEADERS, - ) - 
result.raise_for_status() - except Exception as e: - log.warning(e) - return None + # Use the big text field as description as the meta tag is very short + description_field = data.xpath(DESCRIPTION_XPATH) + assert isinstance(description_field, List) + if description_field is not None: + match.description = "".join(description_field).strip() # type: ignore - # Since we'll do on to do N further requests for more information, - # we'll cut it off at the first five results here. Any sufficiently well - # populated search by title should be enough - for r in result.json()[0:5]: - assert isinstance(r, dict) - match = self._parse_search_result( - result=r, generic_cover=generic_cover, locale=locale - ) - val.append(match) - return val + product_url = data.xpath(URL_PROP_XPATH) + assert isinstance(product_url, List) + if product_url is not None and len(product_url) > 0: + match.url = cast(str, product_url[0]) - def _parse_search_result( - self, result: Dict, generic_cover: str, locale: str - ) -> MetaRecord: - match = MetaRecord( - id=result["name"], - title=result["name"], - authors=[], - url=result.get("link", ""), - source=MetaSourceInfo( - id=self.__id__, - description=DriveThruRpg.DESCRIPTION, - link=DriveThruRpg.META_URL, - ), + # We can get a better ID from the URL + regex = r".*\/product\/(\d+)\/.*" + matches = re.findall(regex, match.url) + if len(matches) > 0: + match.id = matches[0] + + image_url = data.xpath(IMAGE_PROP_XPATH) + assert isinstance(image_url, List) + if image_url is not None and len(image_url) > 0: + # Calibre web doesn't follow redirects and reports some covers as an error + log.info(f"Cover URL is {image_url[0]}") + r = requests.head(image_url[0], allow_redirects=True) + log.info(f"After following redirects, it is {r.url}") + match.cover = cast(str, r.url) + + # Find authors + for div in cast(List, data.xpath(AUTHORS_XPATH)): + # Just bring in elements that look like they might be authors. 
+ authors = list( + filter( + lambda x: re.match(r"^\w[\w\s]+$", x), + div.getnext().xpath(".//text()"), + ) ) + match.authors = authors - try: - details_result = requests.get( - result["link"], - headers=DriveThruRpg.HEADERS, + # Use rule systems as tags + match.tags = ["RPG"] + for div in cast(list, data.xpath(RULE_SYSTEMS_XPATH)): + rule_systems = list( + filter( + lambda x: len(x.strip()) > 0, + div.getnext().xpath(".//text()"), ) - details_result.raise_for_status() - except Exception as e: - log.warning(e) - return match + ) + match.tags.extend(rule_systems) - data = html.fromstring(details_result.content) - - # Use the big text field as description as the meta tag is very short - description_field = data.xpath(self.DESCRIPTION_XPATH) - if description_field is not None: - match.description = "".join(description_field).strip() - - product_url = data.xpath(self.URL_PROP_XPATH) - if product_url is not None and len(product_url) > 0: - match.url = product_url[0] - - # We can get a better ID from the URL - regex = r".*\/product\/(\d+)\/.*" - matches = re.findall(regex, match.url) - if len(matches) > 0: - match.id = matches[0] - - image_url = data.xpath(self.IMAGE_PROP_XPATH) - if image_url is not None and len(image_url) > 0: - match.cover = image_url[0] - - # Find authors - for div in data.xpath(self.AUTHORS_XPATH): - # Just bring in elements that look like they might be authors. - authors = list( - filter( - lambda x: re.match(r"^\w[\w\s]+$", x), - div.getnext().xpath(".//text()"), - ) + for div in cast(List, data.xpath(PUBLISHER_XPATH)): + publisher_link = div.getnext().xpath(".//a") + # Sometimes we get a link, other times it's text in a different element. 
+ if publisher_link is not None and len(publisher_link) > 0: + match.publisher = publisher_link[0].text_content().strip() + else: + publisher_name = div.getnext().xpath( + ".//div[@class='widget-information-item-title']" ) - match.authors = authors + match.publisher = publisher_name[0].text_content().strip() - # Use rule systems as tags - match.tags = ["RPG"] - for div in data.xpath(self.RULE_SYSTEMS_XPATH): - rule_systems = list( - filter( - # lambda x: re.match(r"^\w[()\w\s]+$", x), - lambda x: len(x.strip()) > 0, - div.getnext().xpath(".//text()"), - ) - ) - match.tags.extend(rule_systems) - - for div in data.xpath(self.PUBLISHER_XPATH): - publisher_link = div.getnext().xpath(".//a") - # Sometimes we get a link, other times it's text in a different element. - if publisher_link is not None and len(publisher_link) > 0: - match.publisher = publisher_link[0].text_content().strip() - else: - publisher_name = div.getnext().xpath( - ".//div[@class='widget-information-item-title']" - ) - match.publisher = publisher_name[0].text_content().strip() - - # match.publishedDate = result.get("store_date", result.get("date_added")) - match.identifiers = {"drivethrurpg": match.id} - - return match + return match