# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2022 The University of Manchester, UK
#
# Copyright 2022 The University of Manchester, UK
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Parse HTML to find <link> elements for signposting.
"""
from typing import Union
import warnings
import requests
from bs4 import BeautifulSoup,SoupStrainer
from .signpost import SIGNPOSTING,AbsoluteURI,Signpost,Signposting
def find_signposting_html(uri: Union[AbsoluteURI, str]) -> Signposting:
    """Parse HTML to find ``<link>`` elements for signposting.

    HTTP redirects will be followed and any relative paths in links
    made absolute correspondingly.

    :param uri: An absolute http/https URI, which HTML will be inspected.
    :throws ValueError: If the `uri` is invalid
    :throws IOError: If the network request failed, e.g. connection timeout
    :throws requests.HTTPError: If the HTTP request failed, e.g. 404 Not Found
    :throws UnrecognizedContentType: If the HTTP resource was not a recognized HTML/XHTML content type
    :throws HTMLParser.HTMLParseError: If the HTML could not be parsed.
    :returns: A parsed :class:`Signposting` object (which may be empty)
    """
    # AbsoluteURI() validates the URI up front (raises ValueError),
    # before any network access happens in _get_html().
    html = _get_html(AbsoluteURI(uri))
    return _parse_html(html)
class DownloadedText(str):
    """A string retrieved over HTTP, annotated with download metadata."""

    content_type: str
    """The Content-Type header reported for the downloaded text"""

    requested_url: AbsoluteURI
    """The URL originally requested, before any redirection"""

    resolved_url: AbsoluteURI
    """The final URL, after following redirects."""

    def __new__(cls, value: str, content_type: str, requested_url: AbsoluteURI, resolved_url: AbsoluteURI):
        # Deliberately build a fresh instance even if ``value`` is already a
        # DownloadedText -- its metadata attributes or subclass may differ.
        instance = super().__new__(cls, value)
        # NOTE: content_type is not necessarily a signpost.MediaType, as this
        # string typically includes a charset parameter, e.g.
        # "text/html; charset=iso-8859-1"
        instance.content_type = content_type
        instance.requested_url = requested_url
        instance.resolved_url = resolved_url
        return instance
class HTML(DownloadedText):
    """An HTML document downloaded over HTTP, as a string"""
class XHTML(DownloadedText):
    """An XHTML document downloaded over HTTP, as a string"""
class UnrecognizedContentType(Exception):
    """Raised when an HTTP response is not a recognized (X)HTML content type."""

    def __init__(self, content_type: str, uri: AbsoluteURI):
        message = "Unrecognized content-type %s for <%s>" % (content_type, uri)
        super().__init__(message)
        # Keep the offending values accessible to handlers
        self.content_type = content_type
        self.uri = uri
def _get_html(uri: AbsoluteURI) -> Union[HTML, XHTML]:
    """Download an HTML or XHTML document from the given URI.

    HTTP redirects are followed; the URL resolved after redirection is
    recorded on the returned object, along with the originally requested
    URL and the response Content-Type.

    :param uri: An absolute http/https URI to retrieve.
    :throws IOError: If the network request failed, e.g. connection timeout
    :throws requests.HTTPError: If the HTTP request failed with a 4xx/5xx
        status (except 203 and 410, which only produce a warning).
    :throws UnrecognizedContentType: If the response was not a recognized
        HTML/XHTML content type.
    :returns: The document body as an :class:`HTML` or :class:`XHTML`
        string subclass.
    """
    HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9"
    }
    # Should ideally throw Not Acceptable error if none of the above
    # Fix: a timeout prevents hanging forever on an unresponsive server; a
    # resulting requests.Timeout is an IOError subclass, matching the
    # documented contract of find_signposting_html().
    page = requests.get(uri, headers=HEADERS, timeout=30)
    resolved_url = AbsoluteURI(page.url, uri)
    # Note: According to HTTP/1.1 updates (Appendix B) in
    # https://datatracker.ietf.org/doc/html/rfc7231
    # then Content-Location should NO LONGER be used for
    # resolving relative URI references.
    ##if "Content-Location" in page.headers:
    ##    # More specific, e.g. "index.en.html" - parse as relative URI reference
    ##    resolved_url = AbsoluteURI(page.headers["Content-Location"], resolved_url)
    if page.status_code == 203:
        warnings.warn("203 Non-Authoritative Information <%s> - Signposting URIs may have been rewritten by proxy" %
                      resolved_url)
    elif page.status_code == 410:
        # 410 Gone is deliberately NOT raised: a tombstone page may still
        # carry signposting links. (Fixed typo: "thumbstone" -> "tombstone")
        warnings.warn(
            "410 Gone <%s> - still processing signposting for tombstone page" % resolved_url)
    else:
        # raise requests.HTTPError for any other 4xx/5xx error
        page.raise_for_status()
    ct = page.headers.get("Content-Type", "")
    if "text/html" in ct:
        # page.text should get HTTP-level encoding correct,
        # but will not know about any charset declarations inside.
        return HTML(page.text, ct, uri, resolved_url)
    elif "application/xhtml+xml" in ct or "application/xml" in ct or "xhtml" in ct or "+xml" in ct:
        # Hopefully some XHTML inside.
        # These typically don't have charset parameter, the below
        # will guess by detection
        return XHTML(page.text, ct, uri, resolved_url)
    else:
        # HTTP server didn't honor our Accept header, and returned non-HTML.
        # It may be an image or something else that will crash our HTML parser,
        # so we'll bail out here.
        raise UnrecognizedContentType(ct, uri)
def _parse_html(html: Union[HTML, XHTML]) -> Signposting:
    """Parse a downloaded (X)HTML document for signposting ``<link>`` elements.

    Only ``<link>`` elements inside ``<head>`` are considered. Links with a
    missing ``href``, unrecognized ``rel`` values, or that fail
    :class:`Signpost` validation are skipped with a warning.

    :param html: The downloaded document, carrying its requested and
        resolved URLs (used to make relative ``href`` values absolute).
    :returns: A :class:`Signposting` object (which may be empty) anchored
        at the resolved URL.
    """
    soup = BeautifulSoup(html, 'html.parser',
                         # Ignore any other elements to reduce chance of parse errors
                         parse_only=SoupStrainer(["head", "link"]))
    signposts = []
    if soup.head:  # In case <head> was missing
        for link in soup.head.find_all("link"):
            url = link.get("href")
            if not url:
                warnings.warn("Invalid <link> element, missing href attribute: %s" % link)
                continue
            # Renamed local from "type" to avoid shadowing the builtin
            media_type = link.get("type")
            profiles = link.get("profile")
            # Keep only lower-cased rel values that are known signposting relations
            rels = set(r.lower() for r in link.get("rel", [])
                       if r.lower() in SIGNPOSTING)
            for rel in rels:
                try:
                    # Relative hrefs are resolved against the post-redirect URL
                    signpost = Signpost(rel, url, media_type, profiles, html.resolved_url)
                except ValueError as e:
                    warnings.warn("Ignoring invalid signpost from %s: %s" % (html.requested_url, e))
                    continue
                signposts.append(signpost)
    if not signposts:
        warnings.warn("No signposting found from <%s>" % html.requested_url)
    return Signposting(html.resolved_url, signposts)