diff --git a/root/drivethrurpg.py b/root/drivethrurpg.py index 4bb7cea..ae25190 100644 --- a/root/drivethrurpg.py +++ b/root/drivethrurpg.py @@ -13,7 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from typing import Dict, List, Optional +from typing import Dict, List, Optional, cast from urllib.parse import quote from lxml import html import requests @@ -24,16 +24,132 @@ from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata log = logger.create() +HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + + +class DMSGuild(Metadata): + __name__ = "DMSGuild" + __id__ = "dmsguild" + DESCRIPTION = "DM's Guild" + META_URL = "https://www.dmsguild.com" + BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term=" + QUERY_PARAMS = "&json=true" + HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if not self.active: + return None + + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + + matches = _do_dtrpg_search( + query=f"{self.BASE_URL}{query}{self.QUERY_PARAMS}", + source=MetaSourceInfo( + id=self.__id__, + description=self.DESCRIPTION, + link=self.META_URL, + ), + ) + + return matches + class DriveThruRpg(Metadata): __name__ = "DriveThruRPG" __id__ = "drivethrurpg" DESCRIPTION = "DriveThru RPG" - META_URL = "https://www.drivethrurpg.com/" - BASE_URL = "https://www.drivethrurpg.com/includes/ajax/search_autocomplete_jquery.php?term=" + META_URL = "https://www.drivethrurpg.com" + BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term=" QUERY_PARAMS = "&json=true" - HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"} + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: + if not self.active: + return None + + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + + matches = _do_dtrpg_search( + query=f"{self.BASE_URL}{query}{self.QUERY_PARAMS}", + source=MetaSourceInfo( + id=self.__id__, + description=self.DESCRIPTION, + link=self.META_URL, + ), + ) + + return matches + + +def _do_dtrpg_search(query: str, source: MetaSourceInfo) -> List[MetaRecord]: + try: + log.info(f"Requesting data from: {query}") + result = requests.get( + query, + headers=HEADERS, + ) + result.raise_for_status() + except Exception as e: + log.warning(e) + return list() + + # If there are no hits we see a single element being returned with the easiest + # identifier being the link. + results_list: list = result.json() + if len(results_list) == 1 and results_list[0]["link"] == "#": + log.info("No results found") + return list() + + # Since we'll go on to do N further requests for more information, + # we'll cut it off at the first five results here. Any sufficiently well + # populated search by title should be enough + results: List[MetaRecord] = list() + for r in results_list[0:5]: + assert isinstance(r, dict) + match = _fetch_dtrpg_search_result(result=r, source=source) + + identifiers = {} + identifiers[source.id] = match.id + + match.identifiers = identifiers + + results.append(match) + + return results + + +def _fetch_dtrpg_search_result(result: Dict, source: MetaSourceInfo) -> MetaRecord: + match = MetaRecord( + id=result["name"], + title=result["name"], + authors=[], + url=result.get("link", ""), + source=source, + ) + + try: + details_result = requests.get( + result["link"], + headers=HEADERS, + ) + details_result.raise_for_status() + except Exception as e: + log.warning(e) + return match + + _parse_dtrpg_result(details_result.content, match) + + return match + + +def _parse_dtrpg_result(content: bytes, match: MetaRecord): AUTHORS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Author(s)')]" RULE_SYSTEMS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Rule System(s)')]" PUBLISHER_XPATH = "//div[@class='widget-information-wrapper-2']//div[@class='widget-information-title' and contains(text(), 'Publisher')]" @@ -41,118 +157,65 @@ class DriveThruRpg(Metadata): DESCRIPTION_XPATH = "//div[contains(@class,'prod-content')]//text()" IMAGE_PROP_XPATH = "//meta[@itemprop='image']/@content" - def search( - self, query: str, generic_cover: str = "", locale: str = "en" - ) -> Optional[List[MetaRecord]]: - val = list() - if self.active: - title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) - if title_tokens: - tokens = [quote(t.encode("utf-8")) for t in title_tokens] - query = "%20".join(tokens) + data = html.fromstring(content) - try: - result = requests.get( - f"{DriveThruRpg.BASE_URL}{query}{DriveThruRpg.QUERY_PARAMS}", - headers=DriveThruRpg.HEADERS, - ) - result.raise_for_status() - except Exception as e: - log.warning(e) - return None + # Use the big text field as description as the meta tag is very short + description_field = data.xpath(DESCRIPTION_XPATH) + assert isinstance(description_field, List) + if description_field is not None: + match.description = "".join(description_field).strip() # type: ignore - # Since we'll do on to do N further requests for more information, - # we'll cut it off at the first five results here. Any sufficiently well - # populated search by title should be enough - for r in result.json()[0:5]: - assert isinstance(r, dict) - match = self._parse_search_result( - result=r, generic_cover=generic_cover, locale=locale - ) - val.append(match) - return val + product_url = data.xpath(URL_PROP_XPATH) + assert isinstance(product_url, List) + if product_url is not None and len(product_url) > 0: + match.url = cast(str, product_url[0]) - def _parse_search_result( - self, result: Dict, generic_cover: str, locale: str - ) -> MetaRecord: - match = MetaRecord( - id=result["name"], - title=result["name"], - authors=[], - url=result.get("link", ""), - source=MetaSourceInfo( - id=self.__id__, - description=DriveThruRpg.DESCRIPTION, - link=DriveThruRpg.META_URL, - ), + # We can get a better ID from the URL + regex = r".*\/product\/(\d+)\/.*" + matches = re.findall(regex, match.url) + if len(matches) > 0: + match.id = matches[0] + + image_url = data.xpath(IMAGE_PROP_XPATH) + assert isinstance(image_url, List) + if image_url is not None and len(image_url) > 0: + # Calibre web doesn't follow redirects and reports some covers as an error + log.info(f"Cover URL is {image_url[0]}") + r = requests.head(image_url[0], allow_redirects=True) + log.info(f"After following redirects, it is {r.url}") + match.cover = cast(str, r.url) + + # Find authors + for div in cast(List, data.xpath(AUTHORS_XPATH)): + # Just bring in elements that look like they might be authors. + authors = list( + filter( + lambda x: re.match(r"^\w[\w\s]+$", x), + div.getnext().xpath(".//text()"), + ) ) + match.authors = authors - try: - details_result = requests.get( - result["link"], - headers=DriveThruRpg.HEADERS, + # Use rule systems as tags + match.tags = ["RPG"] + for div in cast(list, data.xpath(RULE_SYSTEMS_XPATH)): + rule_systems = list( + filter( + lambda x: len(x.strip()) > 0, + div.getnext().xpath(".//text()"), ) - details_result.raise_for_status() - except Exception as e: - log.warning(e) - return match + ) + match.tags.extend(rule_systems) - data = html.fromstring(details_result.content) - - # Use the big text field as description as the meta tag is very short - description_field = data.xpath(self.DESCRIPTION_XPATH) - if description_field is not None: - match.description = "".join(description_field).strip() - - product_url = data.xpath(self.URL_PROP_XPATH) - if product_url is not None and len(product_url) > 0: - match.url = product_url[0] - - # We can get a better ID from the URL - regex = r".*\/product\/(\d+)\/.*" - matches = re.findall(regex, match.url) - if len(matches) > 0: - match.id = matches[0] - - image_url = data.xpath(self.IMAGE_PROP_XPATH) - if image_url is not None and len(image_url) > 0: - match.cover = image_url[0] - - # Find authors - for div in data.xpath(self.AUTHORS_XPATH): - # Just bring in elements that look like they might be authors. - authors = list( - filter( - lambda x: re.match(r"^\w[\w\s]+$", x), - div.getnext().xpath(".//text()"), - ) + for div in cast(List, data.xpath(PUBLISHER_XPATH)): + publisher_link = div.getnext().xpath(".//a") + # Sometimes we get a link, other times it's text in a different element. + if publisher_link is not None and len(publisher_link) > 0: + match.publisher = publisher_link[0].text_content().strip() + else: + publisher_name = div.getnext().xpath( + ".//div[@class='widget-information-item-title']" ) - match.authors = authors + match.publisher = publisher_name[0].text_content().strip() - # Use rule systems as tags - match.tags = ["RPG"] - for div in data.xpath(self.RULE_SYSTEMS_XPATH): - rule_systems = list( - filter( - # lambda x: re.match(r"^\w[()\w\s]+$", x), - lambda x: len(x.strip()) > 0, - div.getnext().xpath(".//text()"), - ) - ) - match.tags.extend(rule_systems) - - for div in data.xpath(self.PUBLISHER_XPATH): - publisher_link = div.getnext().xpath(".//a") - # Sometimes we get a link, other times it's text in a different element. - if publisher_link is not None and len(publisher_link) > 0: - match.publisher = publisher_link[0].text_content().strip() - else: - publisher_name = div.getnext().xpath( - ".//div[@class='widget-information-item-title']" - ) - match.publisher = publisher_name[0].text_content().strip() - - # match.publishedDate = result.get("store_date", result.get("date_added")) - match.identifiers = {"drivethrurpg": match.id} - - return match + return match