Source code for signposting.linkset

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2022 The University of Manchester, UK
#
#   Copyright 2022 The University of Manchester, UK
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

"""
Parse linkset documents (`RFC9264`_) for signposting.

.. _RFC9264: https://www.rfc-editor.org/rfc/rfc9264.html
"""

from typing import List, Union
import warnings
import requests
import json
from .signpost import SIGNPOSTING,AbsoluteURI,Signpost,Signposting,MediaType
from .htmllinks import DownloadedText,UnrecognizedContentType
from .linkheader import find_signposting_http_link

def find_signposting_linkset(uri:Union[AbsoluteURI, str], acceptType:Union[MediaType, str]=None) -> Signposting:
    """Retrieve a linkset document (RFC 9264) and parse its signposting links.

    The document is fetched over HTTP and then parsed either as
    ``application/linkset+json`` or as the textual ``application/linkset``
    format, depending on the ``Content-Type`` the server returned.

    HTTP redirects will be followed.

    :param uri: An absolute http/https URI of the linkset document to retrieve.
    :param acceptType: A `MediaType` to content-negotiate access for.
        The default is to content-negotiate including ``application/linkset`` and
        ``application/linkset+json`` with JSON having preference.
    :throws ValueError: If the `uri` is invalid, or if a JSON linkset is not
        valid JSON or lacks the required top-level ``linkset`` list.
    :throws IOError: If the network request failed, e.g. connection timeout
    :throws requests.HTTPError: If the HTTP request failed, e.g. 404 Not Found
    :throws UnrecognizedContentType: If the HTTP resource was not a recognized linkset content type.
        This exception is also raised if ``acceptType`` was provided,
        but didn't match returned ``Content-Type``.
    :returns: A parsed `Signposting` object (which may be empty)
    """
    if acceptType:
        # Explicit media type requested: negotiate for exactly that type.
        linkset = _get_linkset(AbsoluteURI(uri), MediaType(acceptType))
    else:
        # No preference given: fall back to the default Accept header.
        linkset = _get_linkset(AbsoluteURI(uri))

    # The downloaded document's class records which serialization was returned.
    if isinstance(linkset, LinksetJSON):
        return _parse_linkset_json(linkset)
    return _parse_linkset(linkset)

class LinksetJSON(DownloadedText):
    """Text of a downloaded ``application/linkset+json`` document.

    Marker subclass of `DownloadedText`; it adds no behaviour and exists so
    that callers can tell the JSON serialization apart with ``isinstance``.
    """

class Linkset(DownloadedText):
    """Text of a downloaded ``application/linkset`` document.

    Marker subclass of `DownloadedText`; it adds no behaviour and identifies
    the textual (non-JSON) linkset serialization.
    """

# Accept header sent when the caller does not request a specific media type.
# One media range per line, in descending preference: JSON linkset first,
# then the textual linkset, then generic JSON and plain text as fallbacks.
DEFAULT_ACCEPT = (
    "application/linkset+json,"
    "application/linkset;q=0.9,"
    "application/json;q=0.3,"
    "text/plain;q=0.2"
)

def _get_linkset(uri:AbsoluteURI, acceptType:MediaType=None) -> Union[LinksetJSON,Linkset]:
    """Download ``uri`` and wrap the response body according to its content type.

    :param uri: Absolute URI of the linkset document to request.
    :param acceptType: Media type to send as the ``Accept`` header. When not
        given, `DEFAULT_ACCEPT` is sent instead.
    :throws requests.HTTPError: If the response status is a 4xx/5xx error.
    :throws UnrecognizedContentType: If ``acceptType`` was given but does not
        occur in the returned ``Content-Type``, or if the returned
        ``Content-Type`` is neither a JSON type, ``application/linkset``
        nor ``text/plain``.
    :returns: A `LinksetJSON` for JSON content types, otherwise a `Linkset`.
    """
    header = {
        "Accept": str(acceptType) if acceptType else DEFAULT_ACCEPT
    }
    # Should ideally throw Not Acceptable error if none of the above
    # NOTE(review): no timeout is passed, so an unresponsive server can block
    # this call indefinitely - consider adding one.
    page = requests.get(uri, headers=header)

    resolved_url = AbsoluteURI(page.url, uri)

    # Note: According to HTTP/1.1 updates (Appendix B) in
    # https://datatracker.ietf.org/doc/html/rfc7231
    # then Content-Location should NO LONGER be used for
    # resolving relative URI references, so it is deliberately ignored here.

    if page.status_code == 203:
        warnings.warn("203 Non-Authoritative Information <%s> - Signposting URIs may have been rewritten by proxy" %
                    resolved_url)
    # raise requests.HTTPError for any other 4xx/5xx error
    page.raise_for_status()

    ct = page.headers.get("Content-Type", "")
    # Media types are case-insensitive, so compare on a lower-cased copy
    # while reporting/storing the header value exactly as received.
    ct_lower = ct.lower()
    if acceptType and str(acceptType).lower() not in ct_lower:
        # mismatch from what we requested explicitly
        raise UnrecognizedContentType(ct, uri)
    if "json" in ct_lower:
        # Covers application/linkset+json as well as any other JSON type
        return LinksetJSON(page.text, ct, uri, resolved_url)
    if "application/linkset" in ct_lower or "text/plain" in ct_lower:
        # NOTE: we covered linkset+json above, which would otherwise also match here
        return Linkset(page.text, ct, uri, resolved_url)
    # HTTP server didn't honor our default Accept header, we'll bail out here.
    raise UnrecognizedContentType(ct, uri)

def _parse_linkset(linkset:Linkset) -> Signposting:
    """Parse a textual ``application/linkset`` document into a `Signposting`.

    The whole document is handed to `find_signposting_http_link` as a single
    link header value, resolved against the linkset's ``resolved_url``.
    """
    # RFC9264 is based on RFC8288 but additionally allows line breaks.
    # Swap each CR/LF for a space in one pass so the header parser accepts it.
    newlines_to_spaces = str.maketrans("\r\n", "  ")
    single_line = linkset.translate(newlines_to_spaces).strip()
    base_url = linkset.resolved_url
    # TODO: Filter away links that do not have the desired context?
    return find_signposting_http_link([single_line], base_url)

def _parse_linkset_json(linkset:LinksetJSON) -> Signposting:
    """Parse an ``application/linkset+json`` document into a `Signposting`.

    Only link relations listed in `SIGNPOSTING` are considered. Malformed
    parts of the document (link contexts or link targets that are not JSON
    objects, relation values that are not arrays, targets without ``href``)
    are skipped with a warning rather than aborting the whole parse.

    :param linkset: The downloaded JSON linkset text.
    :throws ValueError: If the text is not valid JSON, or the top level is
        not a JSON object containing a ``linkset`` list.
    :returns: A `Signposting` for ``linkset.resolved_url``; it may contain no
        signposts, in which case a warning is issued.
    """
    linksetJSON = json.loads(linkset)
    # Guard the type first: a top-level number/string/array would otherwise
    # fail with TypeError (or index errors) instead of the ValueError below.
    if not isinstance(linksetJSON, dict) or not isinstance(linksetJSON.get("linkset"), list):
        raise ValueError("Not a valid RFC9264 JSON, top list 'linkset' required")
    signposts: List[Signpost] = []
    for link_context in linksetJSON["linkset"]:
        if not isinstance(link_context, dict):
            warnings.warn("Not an object, ignoring link context in 'linkset'")
            continue
        if "anchor" in link_context:
            anchor = AbsoluteURI(link_context["anchor"], linkset.resolved_url)
        else:
            # The linkset itself
            anchor = linkset.resolved_url
        for rel, link_targets in link_context.items():
            if rel == "anchor":
                # Not a link relation, handled above
                continue
            if rel not in SIGNPOSTING:
                # Not a signposting relation, ignored
                continue
            # Proceed to find signposts
            if not isinstance(link_targets, list):
                warnings.warn("Not an array, ignoring link targets for rel=%s" % rel)
                continue
            for link_target in link_targets:
                # A non-object target cannot carry an 'href' member either
                if not isinstance(link_target, dict) or "href" not in link_target:
                    warnings.warn("Missing required 'href' attribute, ignoring link target for rel=%s" % rel)
                    continue
                href = link_target["href"]
                media_type = link_target.get("type")
                profile = link_target.get("profile")
                # Signposting ignores the other attributes for now.
                # TODO: parse them into a Link object for equivalence with
                # _parse_linkset()
                signposts.append(Signpost(rel, href, media_type, profile, anchor))
    if not signposts:
        warnings.warn("No signposts found: <%s>" % linkset.requested_url)
    return Signposting(linkset.resolved_url, signposts)
Navigation

Source code for signposting.linkset

Navigation