# Source code for signposting.htmllinks

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2022 The University of Manchester, UK
#
#   Copyright 2022 The University of Manchester, UK
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""
Parse HTML to find <link> elements for signposting.
"""

from typing import Union
import warnings
import requests
from bs4 import BeautifulSoup,SoupStrainer
from .signpost import SIGNPOSTING,AbsoluteURI,Signpost,Signposting

def find_signposting_html(uri: Union[AbsoluteURI, str]) -> Signposting:
    """Parse HTML to find ``<link>`` elements for signposting.

    HTTP redirects will be followed and any relative paths in links
    made absolute correspondingly.

    :param uri: An absolute http/https URI, which HTML will be inspected.
    :raises ValueError: If the `uri` is invalid
    :raises IOError: If the network request failed, e.g. connection timeout
    :raises requests.HTTPError: If the HTTP request failed, e.g. 404 Not Found
    :raises UnrecognizedContentType: If the HTTP resource was not a
        recognized HTML/XHTML content type
    :raises HTMLParser.HTMLParseError: If the HTML could not be parsed.
    :returns: A parsed :class:`Signposting` object (which may be empty)
    """
    # Validate/normalise the URI first, so that an invalid URI fails
    # before any network request is attempted.
    html = _get_html(AbsoluteURI(uri))
    return _parse_html(html)
class DownloadedText(str):
    """Text downloaded from HTTP.

    A ``str`` subclass that additionally carries the Content-Type header
    value and the URLs (before and after redirection) it was fetched from.
    """

    content_type: str
    """The returned Content-Type of the downloaded text"""

    requested_url: AbsoluteURI
    """The requested URL, before redirection"""

    resolved_url: AbsoluteURI
    """The resolved URL, after redirection."""

    def __new__(cls, value: str, content_type: str,
                requested_url: AbsoluteURI, resolved_url: AbsoluteURI):
        # NOTE: Do not return value if it's already an DownloadedText
        # instance; it may differ in the other attributes or subclass
        s = super().__new__(cls, value)
        # NOTE: content_type is not necessarily a signpost.MediaType,
        # as this string typically includes charset, e.g.
        # "text/html; charset=iso-8859-1"
        s.content_type = content_type
        s.requested_url = requested_url
        s.resolved_url = resolved_url
        return s


class HTML(DownloadedText):
    """Downloaded HTML document as string"""


class XHTML(DownloadedText):
    """Downloaded XHTML document as a string"""


class UnrecognizedContentType(Exception):
    """The HTTP response had a Content-Type that is not HTML/XHTML/XML.

    The offending header value and the requested URI are kept as the
    ``content_type`` and ``uri`` attributes.
    """

    def __init__(self, content_type: str, uri: AbsoluteURI):
        super().__init__("Unrecognized content-type %s for <%s>" % (content_type, uri))
        self.content_type = content_type
        self.uri = uri


# Seconds to wait for the server before giving up. Without a timeout
# ``requests`` may block indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 30.0


def _get_html(uri: AbsoluteURI) -> Union[HTML, XHTML]:
    """Download ``uri`` and wrap the response text as HTML or XHTML.

    Responses with status 203 and 410 are still processed, after
    emitting a warning; any other 4xx/5xx status raises.

    :param uri: The absolute URI to retrieve.
    :raises requests.HTTPError: For 4xx/5xx responses other than 410.
    :raises requests.RequestException: If the request itself failed,
        including when no response arrived within the timeout.
    :raises UnrecognizedContentType: If the Content-Type header does not
        look like HTML, XHTML or XML.
    :returns: :class:`HTML` for ``text/html`` responses, otherwise
        :class:`XHTML` for XHTML/XML-like content types.
    """
    HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9"
    }
    # Should ideally throw Not Acceptable error if none of the above
    page = requests.get(uri, headers=HEADERS, timeout=_REQUEST_TIMEOUT)

    # page.url is the final URL after any redirects
    # (presumably resolved against the requested uri as base -- TODO confirm)
    resolved_url = AbsoluteURI(page.url, uri)
    # Note: According to HTTP/1.1 updates (Appendix B) in
    # https://datatracker.ietf.org/doc/html/rfc7231
    # then Content-Location should NO LONGER be used for
    # resolving relative URI references.
    ##if "Content-Location" in page.headers:
    ##    # More specific, e.g. "index.en.html" - parse as relative URI reference
    ##    resolved_url = AbsoluteURI(page.headers["Content-Location"], resolved_url)

    if page.status_code == 203:
        warnings.warn("203 Non-Authoritative Information <%s> - Signposting URIs may have been rewritten by proxy" %
                      resolved_url)
    elif page.status_code == 410:
        warnings.warn(
            "410 Gone <%s> - still processing signposting for thumbstone page" % resolved_url)
    else:
        # raise requests.HTTPError for any other 4xx/5xx error
        page.raise_for_status()

    ct = page.headers.get("Content-Type", "")
    # Media types are case-insensitive, so match on a lower-cased copy,
    # but keep the header value as received for callers and error messages.
    ct_lower = ct.lower()
    if "text/html" in ct_lower:
        # page.text should get HTTP-level encoding correct,
        # but will not know about any charset declarations inside.
        return HTML(page.text, ct, uri, resolved_url)
    # "xhtml" also covers "application/xhtml+xml"
    if "application/xml" in ct_lower or "xhtml" in ct_lower or "+xml" in ct_lower:
        # Hopefully some XHTML inside.
        # These typically don't have charset parameter, the below
        # will guess by detection
        return XHTML(page.text, ct, uri, resolved_url)
    # HTTP server didn't honor our Accept header, and returned non-HTML.
    # It may be an image or something else that will crash our HTML parser,
    # so we'll bail out here.
    raise UnrecognizedContentType(ct, uri)


def _parse_html(html: Union[HTML, XHTML]) -> Signposting:
    """Extract signposting ``<link>`` elements from a downloaded document.

    Only ``<link>`` elements inside ``<head>`` are considered. Links
    without ``href`` and links rejected by :class:`Signpost` with a
    ``ValueError`` are skipped with a warning. A warning is also emitted
    if no signposts were found at all.

    :param html: The downloaded document; its ``resolved_url`` and
        ``requested_url`` attributes are used for the result and in
        warning messages.
    :returns: A :class:`Signposting` built from ``html.resolved_url`` and
        the signposts found (possibly none).
    """
    soup = BeautifulSoup(html, 'html.parser',
                         # Ignore any other elements to reduce chance of parse errors
                         parse_only=SoupStrainer(["head", "link"]))
    signposts = []
    if soup.head:  # In case <head> was missing
        for link in soup.head.find_all("link"):
            url = link.get("href")
            if not url:
                warnings.warn("Invalid <link> element, missing href attribute: %s" % link)
                continue
            media_type = link.get("type")
            profiles = link.get("profile")
            # Ensure all filters are in lower case and known
            rels = {r.lower() for r in link.get("rel", [])
                    if r.lower() in SIGNPOSTING}
            # One signpost per recognised relation on this <link>
            for rel in rels:
                try:
                    signpost = Signpost(rel, url, media_type, profiles, html.resolved_url)
                except ValueError as e:
                    warnings.warn("Ignoring invalid signpost from %s: %s" % (html.requested_url, e))
                    continue
                signposts.append(signpost)
    if not signposts:
        warnings.warn("No signposting found from <%s>" % html.requested_url)
    return Signposting(html.resolved_url, signposts)