Skip to content

Browse

Domain package for browsing and retrieving full page transcriptions from historical documents.

Models

models

Data models for Riksarkivet browse operations.

Models for browsing document pages with full text and metadata.

BrowseResult

Bases: BaseModel

Result from browsing document pages.

Contains page contexts, manifest ID, and optional OAI-PMH metadata.

PageContext

Bases: BaseModel

Full page context for browsing.

Contains transcribed text, ALTO XML URL, and image URLs for a single page.

Operations

BrowseOperations(http_client)

Browse operations for Riksarkivet document collections.

Provides document browsing functionality for viewing specific pages of documents by reference code.

Attributes:

Name Type Description
alto_client

Client for fetching ALTO XML content.

oai_client

Client for OAI-PMH metadata operations.

iiif_client

Client for interacting with IIIF collections and manifests.

Source code in packages/browse-lib/src/ra_mcp_browse_lib/browse_operations.py
38
39
40
41
def __init__(self, http_client: HTTPClient) -> None:
    """Create the ALTO, OAI-PMH and IIIF clients used for browsing.

    Args:
        http_client: HTTP client handed to each of the three clients.
    """
    # The same HTTP client instance is passed to all three clients.
    self.alto_client = ALTOClient(http_client=http_client)
    self.oai_client = OAIPMHClient(http_client=http_client)
    self.iiif_client = IIIFClient(http_client=http_client)

browse_document(reference_code, pages, highlight_term=None, max_pages=20, research_context=None, session_id=None) async

Browse specific pages of a document.

Retrieves full transcribed content for specified pages of a document, with optional term highlighting. Supports various page specifications including ranges (1-5), lists (1,3,5), and combinations.

Parameters:

Name Type Description Default
reference_code str

Document identifier (e.g., 'SE/RA/730128/730128.006').

required
pages str

Page specification (e.g., '1-3,5,7-9' or 'all').

required
highlight_term str | None

Optional term to highlight in the returned text.

None
max_pages int

Maximum number of pages to retrieve.

20
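research_context str | None

User's research goal (recorded as span attribute for telemetry).

None
session_id str | None

Session identifier (recorded as the mcp.session.id span attribute when provided).

None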

Returns:

Type Description
BrowseResult

BrowseResult containing page contexts, document metadata, and persistent identifiers. Returns empty contexts if document not found or no valid pages.

Source code in packages/browse-lib/src/ra_mcp_browse_lib/browse_operations.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
async def browse_document(
    self,
    reference_code: str,
    pages: str,
    highlight_term: str | None = None,
    max_pages: int = 20,
    research_context: str | None = None,
    session_id: str | None = None,
) -> BrowseResult:
    """Fetch the transcribed pages of a single document.

    Looks up the document's OAI-PMH metadata, derives the manifest ID from
    it and, when a manifest ID exists, loads the requested pages. Page
    specifications may be ranges (1-5), lists (1,3,5), or combinations.

    Args:
        reference_code: Document identifier (e.g., 'SE/RA/730128/730128.006').
        pages: Page specification (e.g., '1-3,5,7-9' or 'all').
        highlight_term: Optional term to highlight in the returned text.
        max_pages: Maximum number of pages to retrieve.
        research_context: User's research goal; recorded as the
            ``browse.research_context`` span attribute when non-empty.
        session_id: Session identifier; recorded as the ``mcp.session.id``
            span attribute when non-empty.

    Returns:
        BrowseResult carrying the page contexts, the reference code, the
        requested page specification and the OAI-PMH metadata. When no
        manifest ID can be derived the contexts are empty and no manifest
        ID is set on the result.

    Raises:
        Exception: Any error raised inside the traced section is recorded on
            the span and on the error counter, then re-raised unchanged.
    """
    span_attributes = {
        "browse.reference_code": reference_code,
        "browse.pages_requested": pages,
    }
    # Optional attributes are only attached when they carry a value.
    if research_context:
        span_attributes["browse.research_context"] = research_context
    if session_id:
        span_attributes["mcp.session.id"] = session_id

    with _tracer.start_as_current_span("BrowseOperations.browse_document", attributes=span_attributes) as span:
        try:
            # A single OAI-PMH fetch supplies both the metadata and the manifest ID.
            metadata = await self.oai_client.get_metadata(reference_code)
            manifest_id = self.oai_client.manifest_id_from_metadata(metadata)

            if manifest_id:
                contexts = await self._fetch_page_contexts(manifest_id, pages, max_pages, reference_code, highlight_term)

                # Pages that are digitised but carry no text are counted separately.
                blank_pages = len([page for page in contexts if not page.full_text])
                if blank_pages:
                    _empty_pages_counter.add(blank_pages)
            else:
                # No manifest means non-digitised material: metadata only, no pages.
                contexts = []

            returned = len(contexts)
            span.set_attribute("browse.pages_returned", returned)
            _browse_counter.add(1, {"browse.status": "success"})
            _pages_histogram.record(returned)

            if manifest_id:
                return BrowseResult(
                    contexts=contexts,
                    reference_code=reference_code,
                    pages_requested=pages,
                    manifest_id=manifest_id,
                    oai_metadata=metadata,
                )
            return BrowseResult(
                contexts=contexts,
                reference_code=reference_code,
                pages_requested=pages,
                oai_metadata=metadata,
            )
        except Exception as exc:
            span.set_status(StatusCode.ERROR, str(exc))
            span.record_exception(exc)
            _browse_counter.add(1, {"browse.status": "error"})
            raise

ALTO Client

ALTOClient(http_client)

Client for fetching and parsing ALTO XML files from Riksarkivet.

ALTO (Analyzed Layout and Text Object) is an XML schema for describing the layout and content of physical text resources. This client handles multiple ALTO namespace versions (v2, v3, v4) and extracts structured text layers from historical document scans.

Attributes:

Name Type Description
http_client

HTTP client instance for making requests to ALTO XML endpoints.

Example

client = ALTOClient(http_client)
layer = await client.fetch_content("https://sok.riksarkivet.se/dokument/alto/SE_RA_123.xml")
print(layer.full_text)  # Full transcribed text from the document

Initialize the ALTO client.

Parameters:

Name Type Description Default
http_client HTTPClient

Configured HTTP client for making requests.

required
Source code in packages/xml-lib/src/ra_mcp_xml/client.py
43
44
45
46
47
48
49
50
def __init__(self, http_client: HTTPClient) -> None:
    """
    Initialize the ALTO client.

    Only stores the HTTP client on the instance; no request is made here.

    Args:
        http_client: Configured HTTP client for making requests.
    """
    self.http_client = http_client

fetch_content(alto_url, timeout=10) async

Fetch and parse an ALTO XML file into a structured TextLayer.

This method performs the complete workflow: fetches the XML document, parses it, and returns a TextLayer with line-level data (polygons, confidence, ids), handling multiple ALTO namespace versions automatically.

Parameters:

Name Type Description Default
alto_url str

Direct URL to the ALTO XML document.

required
timeout int

Request timeout in seconds (default: 10).

10

Returns:

Type Description
TextLayer | None

TextLayer with line-level data and full_text, TextLayer with empty full_text if ALTO exists but has no text (blank page), or None if fetching/parsing fails (404, network error, etc.).

Example

>>> layer = await client.fetch_content("https://sok.riksarkivet.se/dokument/alto/SE_RA_123.xml")
>>> layer.full_text
'Anno 1676 den 15 Januarii förekom för Rätten...'

Source code in packages/xml-lib/src/ra_mcp_xml/client.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
async def fetch_content(self, alto_url: str, timeout: int = 10) -> TextLayer | None:
    """
    Fetch and parse an ALTO XML file into a structured TextLayer.

    This method performs the complete workflow: fetches the XML document, decodes
    and parses it, and returns a TextLayer with line-level data (polygons,
    confidence, ids), handling multiple ALTO namespace versions automatically.

    Args:
        alto_url: Direct URL to the ALTO XML document.
        timeout: Request timeout in seconds (default: 10).

    Returns:
        TextLayer with line-level data and full_text,
        TextLayer with empty full_text if ALTO exists but has no text (blank page),
        or None if the HTTP client returns no content or the payload cannot be
        decoded as UTF-8 or parsed.

    Example:
        >>> layer = await client.fetch_content("https://sok.riksarkivet.se/dokument/alto/SE_RA_123.xml")
        >>> layer.full_text
        'Anno 1676 den 15 Januarii förekom för Rätten...'
    """
    with _tracer.start_as_current_span("ALTOClient.fetch_content", attributes={"alto.url": alto_url}) as span:
        headers = {"Accept": "application/xml, text/xml, */*"}
        raw = await self.http_client.get_content(alto_url, timeout=timeout, headers=headers)
        if not raw:
            span.set_attribute("alto.result", "not_found")
            _fetch_counter.add(1, {"alto.result": "not_found"})
            return None

        try:
            # Decoding belongs to the guarded section: a payload that is not valid
            # UTF-8 must be reported as a parse error and yield None, exactly like
            # malformed XML, instead of escaping as UnicodeDecodeError.
            xml_content = raw.decode("utf-8") if isinstance(raw, bytes) else raw
            text_layer = detect_and_parse(xml_content)
        except Exception as e:
            logger.warning("Failed to parse ALTO XML from %s: %s", alto_url, e)
            span.set_status(StatusCode.ERROR, f"XML parse error: {e}")
            span.record_exception(e)
            span.set_attribute("alto.result", "parse_error")
            _fetch_counter.add(1, {"alto.result": "parse_error"})
            return None

        # An ALTO file without any text is a valid result (blank page), not a failure.
        result_type = "success" if text_layer.full_text else "empty"
        span.set_attribute("alto.result", result_type)
        span.set_attribute("alto.text_length", len(text_layer.full_text))
        _fetch_counter.add(1, {"alto.result": result_type})
        return text_layer

IIIF Client

IIIFClient(http_client)

Client for IIIF collections and manifests.

Source code in packages/iiif-lib/src/ra_mcp_iiif_lib/client.py
21
22
def __init__(self, http_client: HTTPClient) -> None:
    """Store the HTTP client on the instance.

    Args:
        http_client: HTTP client kept as ``self.http_client``.
    """
    self.http_client = http_client

get_collection(pid, timeout=30) async

Get IIIF collection with typed model.

Parameters:

Name Type Description Default
pid str

Persistent identifier for the collection.

required
timeout int

Request timeout in seconds.

30

Returns:

Type Description
IIIFCollection | None

Parsed collection with manifests, or None on fetch failure.

Source code in packages/iiif-lib/src/ra_mcp_iiif_lib/client.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
async def get_collection(self, pid: str, timeout: int = 30) -> IIIFCollection | None:
    """Fetch a IIIF collection and return it as a typed model.

    Args:
        pid: Persistent identifier for the collection.
        timeout: Request timeout in seconds.

    Returns:
        IIIFCollection built from the response's ``id``, ``label`` and
        ``items`` entries, or None when the fetch yields no data.
    """
    with _tracer.start_as_current_span("IIIFClient.get_collection", attributes={"iiif.pid": pid}) as span:
        payload = await self._fetch_json(f"{COLLECTION_API_BASE_URL}/{pid}", timeout)

        if payload:
            manifests = self._parse_manifests(payload.get("items", []))
            span.set_attribute("iiif.manifests_found", len(manifests))

            # The requested pid stands in when the response carries no "id".
            collection_id = payload.get("id", pid)
            collection_label = self._extract_iiif_label(payload.get("label"))
            return IIIFCollection(id=collection_id, label=collection_label, manifests=manifests)

        span.set_attribute("iiif.result", "not_found")
        return None

OAI-PMH Client

OAIPMHClient(http_client, base_url=OAI_BASE_URL)

Client for interacting with OAI-PMH repositories.

Source code in packages/oai-pmh-lib/src/ra_mcp_oai_pmh_lib/client.py
30
31
32
def __init__(self, http_client: HTTPClient, base_url: str = OAI_BASE_URL) -> None:
    """Store the HTTP client and the repository base URL on the instance.

    Args:
        http_client: HTTP client kept as ``self.http_client``.
        base_url: Base URL kept as ``self.base_url``; defaults to ``OAI_BASE_URL``.
    """
    self.http_client = http_client
    self.base_url = base_url

extract_manifest_id(identifier) async

Extract PID from a record for IIIF access.

Source code in packages/oai-pmh-lib/src/ra_mcp_oai_pmh_lib/client.py
64
65
66
67
68
69
70
71
72
73
74
async def extract_manifest_id(self, identifier: str) -> str | None:
    """Fetch a record's metadata and derive its manifest ID for IIIF access.

    Args:
        identifier: Record identifier passed on to ``get_metadata``.

    Returns:
        The manifest ID derived from the metadata, or None when none can be
        derived or any error occurs (the error is logged and recorded on
        the span).
    """
    span_attributes = {"oai.identifier": identifier}
    with _tracer.start_as_current_span("OAIPMHClient.extract_manifest_id", attributes=span_attributes) as span:
        try:
            return self.manifest_id_from_metadata(await self.get_metadata(identifier))
        except Exception as exc:
            logger.warning("Failed to extract manifest ID for %s: %s", identifier, exc)
            span.set_status(StatusCode.ERROR, str(exc))
            span.record_exception(exc)
            return None

get_metadata(identifier) async

Get record metadata as typed OAIPMHMetadata model.

Fetches the OAI-PMH GetRecord response for the given identifier and parses the EAD metadata into a structured model.

Parameters:

Name Type Description Default
identifier str

Record identifier (e.g., "SE/RA/310187/1").

required

Returns:

Type Description
OAIPMHMetadata | None

Parsed metadata, or None on fetch/parse failure.

Source code in packages/oai-pmh-lib/src/ra_mcp_oai_pmh_lib/client.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
async def get_metadata(self, identifier: str) -> OAIPMHMetadata | None:
    """Fetch one record and return its metadata as an OAIPMHMetadata model.

    Issues an OAI-PMH ``GetRecord`` request with the ``oai_ape_ead``
    metadata prefix, extracts the record from the response and parses its
    header and EAD metadata.

    Args:
        identifier: Record identifier (e.g., "SE/RA/310187/1").

    Returns:
        Parsed metadata, or None when any step raises (the error is logged,
        recorded on the span and counted as an error).
    """
    with _tracer.start_as_current_span("OAIPMHClient.get_metadata", attributes={"oai.identifier": identifier}) as span:
        try:
            request_params: dict[str, str | int] = {
                "verb": "GetRecord",
                "identifier": identifier,
                "metadataPrefix": "oai_ape_ead",
            }
            record = self._extract_record(await self._make_request(request_params))

            header_id, datestamp = self._parse_header(record)
            # The requested identifier is used when the header supplies none.
            parsed = self._parse_ead_metadata(record, header_id or identifier, datestamp)

            _fetch_counter.add(1, {"oai_pmh.result": "success"})
            return parsed
        except Exception as exc:
            logger.warning("Failed to get OAI-PMH metadata for %s: %s", identifier, exc)
            span.set_status(StatusCode.ERROR, str(exc))
            span.record_exception(exc)
            _fetch_counter.add(1, {"oai_pmh.result": "error"})
            return None

manifest_id_from_metadata(metadata)

Extract manifest ID from already-fetched metadata (no HTTP call).

Source code in packages/oai-pmh-lib/src/ra_mcp_oai_pmh_lib/client.py
76
77
78
79
80
def manifest_id_from_metadata(self, metadata: OAIPMHMetadata | None) -> str | None:
    """Derive the manifest ID from metadata that was already fetched.

    No HTTP call is made; the ID is extracted from ``metadata.nad_link``.

    Args:
        metadata: Previously fetched metadata, or None.

    Returns:
        The manifest ID extracted from the NAD link, or None when the
        metadata is missing or has no NAD link.
    """
    if not metadata or not metadata.nad_link:
        return None
    return self._extract_manifest_id_from_nad_link(metadata.nad_link)

URL Generator

url_generator

URL generation utilities for Riksarkivet resources.

alto_url(manifest_id, page_number)

Generate ALTO URL from manifest ID and page number.

Parameters:

Name Type Description Default
manifest_id str

Manifest identifier (not PID - should be clean manifest ID)

required
page_number str

Page number

required

Returns:

Type Description
str | None

ALTO XML URL or None if cannot generate

Source code in packages/browse-lib/src/ra_mcp_browse_lib/url_generator.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def alto_url(manifest_id: str, page_number: str) -> str | None:
    """Build the ALTO XML URL for one page of a manifest.

    The URL is composed of the ALTO base URL, a directory named after the
    first four characters of the manifest ID, the manifest ID itself and a
    file name made of the manifest ID plus the padded page number.

    Args:
        manifest_id: Manifest identifier (not PID - should be clean manifest ID)
        page_number: Page number

    Returns:
        ALTO XML URL, or None when the manifest ID is shorter than four
        characters or URL generation raises (a warning is logged).
    """
    try:
        page_suffix = format_page_number(page_number)

        # The leading four characters of the manifest ID name the directory.
        if len(manifest_id) < 4:
            return None
        directory = manifest_id[:4]
        return f"{ALTO_BASE_URL}/{directory}/{manifest_id}/{manifest_id}_{page_suffix}.xml"
    except Exception as exc:
        logger.warning("Failed to generate ALTO URL for manifest=%s page=%s: %s", manifest_id, page_number, exc)
        return None

bildvisning_url(manifest_id, page_number, search_term=None)

Generate bildvisning URL with optional search highlighting.

Parameters:

Name Type Description Default
manifest_id str

Manifest ID

required
page_number str

Page number

required
search_term str | None

Optional search term to highlight

None

Returns:

Type Description
str | None

Bildvisning URL or None if cannot generate

Source code in packages/browse-lib/src/ra_mcp_browse_lib/url_generator.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def bildvisning_url(manifest_id: str, page_number: str, search_term: str | None = None) -> str | None:
    """Build the bildvisning URL for a page, optionally with a search term.

    Args:
        manifest_id: Manifest ID; an ``arkis!`` prefix is removed first.
        page_number: Page number
        search_term: Optional search term to highlight. It is stripped and
            percent-encoded; an empty or whitespace-only term is ignored.

    Returns:
        Bildvisning URL, with ``#?q=<term>`` appended when a search term is
        given, or None when URL generation raises (a warning is logged).
    """
    try:
        page_id = f"{remove_arkis_prefix(manifest_id)}_{format_page_number(page_number)}"
        viewer_url = f"{BILDVISNING_BASE_URL}/{page_id}"

        term = search_term.strip() if search_term else ""
        if not term:
            return viewer_url
        return f"{viewer_url}#?q={urllib.parse.quote(term)}"
    except Exception as exc:
        logger.warning("Failed to generate bildvisning URL for manifest=%s page=%s: %s", manifest_id, page_number, exc)
        return None

format_page_number(page_number)

Format page number with proper padding.

Parameters:

Name Type Description Default
page_number str

Page number string

required

Returns:

Type Description
str

Padded page number (5 digits)

Source code in packages/browse-lib/src/ra_mcp_browse_lib/url_generator.py
27
28
29
30
31
32
33
34
35
36
37
38
39
def format_page_number(page_number: str) -> str:
    """Zero-pad a page number to at least five characters.

    Leading underscores are dropped first. An all-digit value is normalised
    through ``int`` (so surplus leading zeros disappear) and padded to five
    digits; any other value is left-filled with zeros as text.

    Args:
        page_number: Page number string

    Returns:
        Padded page number (five characters, or more if the input is longer)
    """
    stripped = page_number.lstrip("_")
    if not stripped.isdigit():
        # Non-numeric page labels are padded as plain text.
        return stripped.zfill(5)
    return format(int(stripped), "05d")

iiif_image_url(manifest_id, page_number)

Generate IIIF image URL from manifest ID and page number.

Parameters:

Name Type Description Default
manifest_id str

Manifest ID

required
page_number str

Page number

required

Returns:

Type Description
str | None

IIIF image URL or None if cannot generate

Source code in packages/browse-lib/src/ra_mcp_browse_lib/url_generator.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def iiif_image_url(manifest_id: str, page_number: str) -> str | None:
    """Build the IIIF image URL for one page of a manifest.

    Args:
        manifest_id: Manifest ID; an ``arkis!`` prefix is removed first.
        page_number: Page number

    Returns:
        IIIF image URL ending in ``/full/max/0/default.jpg``, or None when
        URL generation raises (a warning is logged).
    """
    try:
        image_id = f"{remove_arkis_prefix(manifest_id)}_{format_page_number(page_number)}"
        return f"{IIIF_IMAGE_BASE_URL}!{image_id}/full/max/0/default.jpg"
    except Exception as exc:
        logger.warning("Failed to generate IIIF image URL for manifest=%s page=%s: %s", manifest_id, page_number, exc)
        return None

remove_arkis_prefix(manifest_id)

Remove arkis! prefix from manifest ID if present.

Parameters:

Name Type Description Default
manifest_id str

Manifest ID string, potentially with arkis! prefix

required

Returns:

Type Description
str

Manifest ID without arkis! prefix

Source code in packages/browse-lib/src/ra_mcp_browse_lib/url_generator.py
14
15
16
17
18
19
20
21
22
23
24
def remove_arkis_prefix(manifest_id: str) -> str:
    """Strip a single leading ``arkis!`` from a manifest ID.

    Args:
        manifest_id: Manifest ID string, potentially with arkis! prefix

    Returns:
        The manifest ID with one leading ``arkis!`` removed; unchanged when
        it does not start with that prefix.
    """
    arkis_prefix = "arkis!"
    return manifest_id.removeprefix(arkis_prefix)