Models

Base model

BaseModel

Bases: ABC

Model base class

This is the abstract base class of HTRflow models. It handles batching of inputs, some shared initialization arguments and generic logging.

Concrete model implementations subclass this class and define their prediction method in `_predict()`.

Parameters:

- `device` (`str | None`, default `None`): Model device as a string, recognizable by torch. Defaults to `None`, which sets the device to `cuda` or `cpu` depending on availability.
- `allow_tf32` (`bool`, default `True`): Allow running matrix multiplications with TensorFloat-32. This speeds up inference at the expense of inference quality. On Ampere and newer CUDA devices, enabling TF32 can improve performance for matrix multiplications and convolutions. Read more here: https://huggingface.co/docs/diffusers/optimization/fp16#tensorfloat-32
- `allow_cudnn_benchmark` (`bool`, default `False`): When True, enables cuDNN benchmarking to select the fastest convolution algorithms for fixed input sizes, potentially increasing performance. Note that this may introduce nondeterminism. Read more here: https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32
Source code in src/htrflow/models/base_model.py
def __init__(self, device: str | None = None, allow_tf32: bool = True, allow_cudnn_benchmark: bool = False):
    """
    Arguments:
        device: Model device as a string, recognizable by torch. Defaults
            to `None`, which sets the device to `cuda` or `cpu` depending
            on availability.
        allow_tf32: Allow running matrix multiplications with TensorFloat-32.
            This speeds up inference at the expense of inference quality.
            On Ampere and newer CUDA devices, enabling TF32 can improve
            performance for matrix multiplications and convolutions.
            Read more here:
            https://huggingface.co/docs/diffusers/optimization/fp16#tensorfloat-32
        allow_cudnn_benchmark: When True, enables cuDNN benchmarking to
            select the fastest convolution algorithms for fixed input sizes,
            potentially increasing performance. Note that this may introduce
            nondeterminism. Defaults to False.
            Read more here:
            https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32
    """
    self.metadata = {"model_class": self.__class__.__name__}
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device)

    if torch.cuda.is_available():
        # Allow matrix multiplication with TensorFloat-32
        torch.backends.cuda.matmul.allow_tf32 = allow_tf32
        torch.backends.cudnn.allow_tf32 = allow_tf32
        torch.backends.cudnn.benchmark = allow_cudnn_benchmark

_predict abstractmethod

Model-specific prediction method

Source code in src/htrflow/models/base_model.py
@abstractmethod
def _predict(self, images: list[NumpyImage], **kwargs) -> list[Result]:
    """Model specific prediction method"""

predict

Perform inference on images

Takes an arbitrary number of inputs and runs batched inference. The inputs can be streamed from an iterator and don't need to be simultaneously read into memory. Prints a progress bar using tqdm. This is a template method which uses the model-specific _predict(...).

Parameters:

- `images` (`Collection[NumpyImage]`, required): Input images.
- `batch_size` (`int`, default `1`): Inference batch size.
- `image_scaling_factor` (`float`, default `1.0`): If < 1, all input images will be downscaled by this factor, which can be useful for speeding up inference on higher-resolution images. All geometric data in the result (e.g., bounding boxes) are reported with respect to the original resolution.
- `tqdm_kwargs` (`dict[str, Any] | None`, default `None`): Optional keyword arguments to control the progress bar.
- `**kwargs`: Optional keyword arguments that are forwarded to the model-specific prediction method `_predict(...)`.
Source code in src/htrflow/models/base_model.py
def predict(
    self,
    images: Collection[NumpyImage],
    batch_size: int = 1,
    image_scaling_factor: float = 1.0,
    tqdm_kwargs: dict[str, Any] | None = None,
    **kwargs,
) -> list[Result]:
    """Perform inference on images

    Takes an arbitrary number of inputs and runs batched inference.
    The inputs can be streamed from an iterator and don't need to
    be simultaneously read into memory. Prints a progress bar using
    `tqdm`. This is a template method which uses the model-specific
    `_predict(...)`.

    Arguments:
        images: Input images
        batch_size: Inference batch size, defaults to 1
        image_scaling_factor: If < 1, all input images will be down-
            scaled by this factor, which can be useful for speeding
            up inference on higher resolution images. All geometric
            data in the result (e.g., bounding boxes) are reported
            with respect to the original resolution.
        tqdm_kwargs: Optional keyword arguments to control the
            progress bar.
        **kwargs: Optional keyword arguments that are forwarded to
            the model specific prediction method `_predict(...)`.
    """

    batch_size = max(batch_size, 1)  # make sure batch size is at least 1
    image_scaling_factor = max(10e-10, min(image_scaling_factor, 1))  # clip scaling factor to (0, 1]

    n_batches = (len(images) + batch_size - 1) // batch_size
    model_name = self.__class__.__name__
    logger.info(
        "Model '%s' on device '%s' received %d images in batches of %d images per batch (%d batches)",
        model_name,
        self.device,
        len(images),
        batch_size,
        n_batches,
    )

    results = []
    batches = _batch(images, batch_size)
    desc = f"{model_name}: Running inference (batch size {batch_size})"
    for i, batch in enumerate(tqdm(batches, desc, n_batches, **(tqdm_kwargs or {}))):
        msg = "%s: Running inference on %d images (batch %d of %d)"
        logger.info(msg, model_name, len(batch), i + 1, n_batches)
        scaled_batch = [rescale_linear(image, image_scaling_factor) for image in batch]
        batch_results = self._predict(scaled_batch, **kwargs)
        for result in batch_results:
            result.rescale(1 / image_scaling_factor)
            results.append(result)
    return results
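
As a usage sketch (assuming `model` is any concrete BaseModel subclass; the image paths are hypothetical):

```python
import cv2

# Read input images as numpy arrays.
images = [cv2.imread(path) for path in ["page1.jpg", "page2.jpg"]]

results = model.predict(
    images,
    batch_size=8,                   # 8 images per inference batch
    image_scaling_factor=0.5,       # halve the resolution before inference
    tqdm_kwargs={"disable": True},  # silence the progress bar
)
# Geometric data in `results` (e.g. bounding boxes) refer to the
# original, unscaled image resolution.
```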

Text recognition models

TrOCR

Bases: BaseModel

HTRflow adapter of the transformer-based OCR model TrOCR.

Uses huggingface's implementation of TrOCR. For further information, see https://huggingface.co/docs/transformers/model_doc/trocr.

Example usage with the TextRecognition step:

- step: TextRecognition
  settings:
    model: TrOCR
    model_settings:
      model: Riksarkivet/trocr-base-handwritten-hist-swe-2
      device: cpu
      model_kwargs:
        revision: 6ecbb5d643430385e1557001ae78682936f8747f
    generation_settings:
      batch_size: 8
      num_beams: 1

Parameters:

- `model` (`str`, required): Path or name of a pretrained VisionEncoderDecoderModel.
- `processor` (`str | None`, default `None`): Optional path or name of a pretrained TrOCRProcessor. If not given, the model path or name is used.
- `model_kwargs` (`dict[str, Any] | None`, default `None`): Model initialization kwargs which are forwarded to VisionEncoderDecoderModel.from_pretrained.
- `processor_kwargs` (`dict[str, Any] | None`, default `None`): Processor initialization kwargs which are forwarded to TrOCRProcessor.from_pretrained.
- `kwargs`: Additional kwargs which are forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/huggingface/trocr.py
def __init__(
    self,
    model: str,
    processor: str | None = None,
    model_kwargs: dict[str, Any] | None = None,
    processor_kwargs: dict[str, Any] | None = None,
    **kwargs,
):
    """
    Arguments:
        model: Path or name of pretrained VisionEncoderDecoderModel.
        processor: Optional path or name of pretrained TrOCRProcessor.
            If not given, the model path or name is used.
        model_kwargs: Model initialization kwargs which are forwarded to
            VisionEncoderDecoderModel.from_pretrained.
        processor_kwargs: Processor initialization kwargs which are
            forwarded to TrOCRProcessor.from_pretrained.
        kwargs: Additional kwargs which are forwarded to BaseModel's
            __init__.
    """
    super().__init__(**kwargs)

    # Initialize model
    model_kwargs = HF_CONFIG | (model_kwargs or {})
    self.model = VisionEncoderDecoderModel.from_pretrained(model, **model_kwargs)
    self.model.to(self.device)
    logger.info("Initialized TrOCR model from %s on device %s.", model, self.model.device)

    # Initialize processor
    processor = processor or model
    processor_kwargs = HF_CONFIG | (processor_kwargs or {})
    self.processor = TrOCRProcessor.from_pretrained(processor, **processor_kwargs)
    logger.info("Initialized TrOCR processor from %s.", processor)

    self.metadata.update(
        {
            "model": model,
            "model_version": model_info(model).sha,
            "processor": processor,
            "processor_version": model_info(processor).sha,
        }
    )
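
Outside of a pipeline, the model can also be constructed and used directly. A sketch mirroring the YAML example above (the import path is assumed from the source location):

```python
from htrflow.models.huggingface.trocr import TrOCR  # assumed import path

model = TrOCR(
    model="Riksarkivet/trocr-base-handwritten-hist-swe-2",
    device="cpu",
    model_kwargs={"revision": "6ecbb5d643430385e1557001ae78682936f8747f"},
)
results = model.predict(line_images, batch_size=8)  # line_images: list of numpy arrays
```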

_predict

TrOCR-specific prediction method.

This method is used by predict() and should typically not be called directly. However, predict() forwards additional kwargs to this method.

Parameters:

- `images` (`list[ndarray]`, required): Input images.
- `**generation_kwargs`: Optional keyword arguments that are forwarded to the model's `.generate()` method.
Source code in src/htrflow/models/huggingface/trocr.py
def _predict(self, images: list[np.ndarray], **generation_kwargs) -> list[Result]:
    """TrOCR-specific prediction method.

    This method is used by `predict()` and should typically not be
    called directly. However, `predict()` forwards additional kwargs
    to this method.

    Arguments:
        images: Input images.
        **generation_kwargs: Optional keyword arguments that are
            forwarded to the model's .generate() method.
    """

    # Prepare generation keyword arguments: Generally, all kwargs are
    # forwarded to the model's .generate method, but some need to be
    # explicitly set (and possibly overridden) to ensure that we get the
    # output format we want.
    generation_kwargs["num_return_sequences"] = generation_kwargs.get("num_beams", 1)
    generation_kwargs["output_scores"] = True
    generation_kwargs["return_dict_in_generate"] = True

    # Do inference
    with torch.no_grad():
        model_inputs = self.processor(images, return_tensors="pt").pixel_values
        model_outputs = self.model.generate(model_inputs.to(self.model.device), **generation_kwargs)

        texts = self.processor.batch_decode(model_outputs.sequences, skip_special_tokens=True)
        scores = self._compute_sequence_scores(model_outputs)

    # Assemble and return a list of Result objects from the prediction outputs.
    # `texts` and `scores` are flattened lists so we need to iterate over them in steps.
    # This is done to ensure that the list of results correspond 1-to-1 with the list of images.
    results = []
    metadata = self.metadata | {"generation_kwargs": generation_kwargs}
    step = generation_kwargs["num_return_sequences"]
    for i in range(0, len(texts), step):
        texts_chunk = texts[i : i + step]
        scores_chunk = scores[i : i + step]
        result = Result.text_recognition_result(metadata, texts_chunk, scores_chunk)
        results.append(result)
    return results
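
Since `predict()` forwards its extra kwargs here, generation options such as beam search can be passed at the call site. Because `num_return_sequences` is set to `num_beams`, each returned Result then holds one candidate transcription (with its score) per beam. A sketch:

```python
# Each Result in `results` contains 4 candidate transcriptions per image.
results = model.predict(line_images, batch_size=8, num_beams=4)
```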

WordLevelTrOCR

Bases: TrOCR

A version of TrOCR which outputs words instead of lines.

This TrOCR wrapper uses the model's attention weights to estimate word boundaries. See notebook [TODO: link] for more details. It does not support beam search, but can otherwise be used as a drop-in replacement for TrOCR.

Example usage with the TextRecognition step:

- step: TextRecognition
  settings:
    model: WordLevelTrOCR
    model_settings:
      model: Riksarkivet/trocr-base-handwritten-hist-swe-2
      device: cpu
      model_kwargs:
        revision: 6ecbb5d643430385e1557001ae78682936f8747f
    generation_settings:
      batch_size: 8

Parameters:

- `model` (`str`, required): Path or name of a pretrained VisionEncoderDecoderModel.
- `processor` (`str | None`, default `None`): Optional path or name of a pretrained TrOCRProcessor. If not given, the model path or name is used.
- `model_kwargs` (`dict[str, Any] | None`, default `None`): Model initialization kwargs which are forwarded to VisionEncoderDecoderModel.from_pretrained.
- `processor_kwargs` (`dict[str, Any] | None`, default `None`): Processor initialization kwargs which are forwarded to TrOCRProcessor.from_pretrained.
- `kwargs`: Additional kwargs which are forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/huggingface/trocr.py
def __init__(
    self,
    model: str,
    processor: str | None = None,
    model_kwargs: dict[str, Any] | None = None,
    processor_kwargs: dict[str, Any] | None = None,
    **kwargs,
):
    """
    Arguments:
        model: Path or name of pretrained VisionEncoderDecoderModel.
        processor: Optional path or name of pretrained TrOCRProcessor.
            If not given, the model path or name is used.
        model_kwargs: Model initialization kwargs which are forwarded to
            VisionEncoderDecoderModel.from_pretrained.
        processor_kwargs: Processor initialization kwargs which are
            forwarded to TrOCRProcessor.from_pretrained.
        kwargs: Additional kwargs which are forwarded to BaseModel's
            __init__.
    """
    super().__init__(**kwargs)

    # Initialize model
    model_kwargs = HF_CONFIG | (model_kwargs or {})
    self.model = VisionEncoderDecoderModel.from_pretrained(model, **model_kwargs)
    self.model.to(self.device)
    logger.info("Initialized TrOCR model from %s on device %s.", model, self.model.device)

    # Initialize processor
    processor = processor or model
    processor_kwargs = HF_CONFIG | (processor_kwargs or {})
    self.processor = TrOCRProcessor.from_pretrained(processor, **processor_kwargs)
    logger.info("Initialized TrOCR processor from %s.", processor)

    self.metadata.update(
        {
            "model": model,
            "model_version": model_info(model).sha,
            "processor": processor,
            "processor_version": model_info(processor).sha,
        }
    )

Satrn

Bases: BaseModel

HTRflow adapter of OpenMMLab's Satrn model

Example usage with the TextRecognition pipeline step:

- step: TextRecognition
  settings:
    model: Satrn
    model_settings:
      model: Riksarkivet/satrn_htr

Parameters:

- `model` (`str`, required): Path to a local .pth model weights file or to a huggingface repo which contains a .pth file, for example 'Riksarkivet/satrn_htr'.
- `config` (`str | None`, default `None`): Path to a local config.py file or to a huggingface repo which contains a config.py file, for example 'Riksarkivet/satrn_htr'.
- `revision` (`str | None`, default `None`): A specific model revision, as a commit hash of the model's huggingface repo. If None, the latest available revision is used.
- `kwargs`: Additional kwargs which are forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/openmmlab/satrn.py
def __init__(
    self,
    model: str,
    config: str | None = None,
    revision: str | None = None,
    **kwargs,
) -> None:
    """
    Arguments:
        model: Path to a local .pth model weights file or to a
            huggingface repo which contains a .pth file, for example
            'Riksarkivet/satrn_htr'.
        config: Path to a local config.py file or to a huggingface
            repo which contains a config.py file, for example
            'Riksarkivet/satrn_htr'.
        revision: A specific model revision, as a commit hash of the
            model's huggingface repo. If None, the latest available
            revision is used.
        kwargs: Additional kwargs which are forwarded to BaseModel's
            `__init__`.
    """
    super().__init__(**kwargs)

    config = config or model
    model_weights, model_config = load_mmlabs(model, config, revision)

    with SuppressOutput():
        self.model = TextRecInferencer(model=model_config, weights=model_weights, device=self.device)

    logger.info(
        "Loaded Satrn model '%s' from %s with config %s on device %s",
        model,
        model_weights,
        model_config,
        self.device,
    )

    self.metadata.update(
        {
            "model": model,
            "model_version": commit_hash_from_path(model_weights),
            "config": config,
            "config_version": commit_hash_from_path(model_config),
        }
    )
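
A direct-usage sketch (the import path is assumed from the source location):

```python
from htrflow.models.openmmlab.satrn import Satrn  # assumed import path

model = Satrn(model="Riksarkivet/satrn_htr", device="cuda")
results = model.predict(line_images, batch_size=8)  # line_images: list of numpy arrays
```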

_predict

Satrn-specific prediction method

This method is used by predict() and should typically not be called directly.

Parameters:

- `images` (`list[NumpyImage]`, required): Input images.
- `kwargs`: Additional keyword arguments that are forwarded to `mmocr.apis.TextRecInferencer.__call__()`.
Source code in src/htrflow/models/openmmlab/satrn.py
def _predict(self, images: list[NumpyImage], **kwargs) -> list[Result]:
    """
    Satrn-specific prediction method

    This method is used by `predict()` and should typically not be
    called directly.

    Arguments:
        images: Input images
        kwargs: Additional keyword arguments that are forwarded to
            `mmocr.apis.TextRecInferencer.__call__()`.
    """
    outputs = self.model(
        images,
        batch_size=len(images),
        return_datasamples=False,
        progress_bar=False,
        **kwargs,
    )
    results = []
    for prediction in outputs["predictions"]:
        texts = prediction["text"]
        scores = prediction["scores"]
        results.append(Result.text_recognition_result(self.metadata, texts, scores))
    return results

PyLaia

Bases: BaseModel

A minimal HTRflow-style model wrapper around PyLaia.

Uses Teklia's implementation of PyLaia. For further information, see: https://atr.pages.teklia.com/pylaia/usage/prediction/#decode-arguments

Example usage with the TextRecognition step:

- step: TextRecognition
  settings:
    model: PyLaia
    model_settings:
      model: Teklia/pylaia-belfort
      device: cuda
      revision: d35f921605314afc7324310081bee55a805a0b9f
    generation_settings:
      batch_size: 8
      temperature: 1

Parameters:

- `model` (`str`, required): The Hugging Face Hub repository ID or a local path with PyLaia artifacts: `weights.ckpt`, `syms.txt` and, optionally, `language_model.arpa.gz`, `lexicon.txt` and `tokens.txt`.
- `revision` (`str | None`, default `None`): Optional revision of the Huggingface repository.
- `use_binary_lm` (`bool`, default `False`): Whether to use the binary language model format; see `get_pylaia_model` for more info.
- `kwargs`: Additional kwargs passed to BaseModel's `__init__` (e.g., `device`).
Source code in src/htrflow/models/teklia/pylaia.py
def __init__(
    self,
    model: str,
    revision: str | None = None,
    use_binary_lm: bool = False,
    **kwargs,
):
    """
    Arguments:
        model (str):
            The Hugging Face Hub repository ID or a local path with PyLaia artifacts:
            - weights.ckpt
            - syms.txt
            - (optionally) language_model.arpa.gz, lexicon.txt, tokens.txt
        revision: Optional revision of the Huggingface repository.
        use_binary_lm (bool): Whether to use binary language model format (default: False),
                              see `get_pylaia_model` for more info.
        kwargs:
            Additional kwargs passed to BaseModel.__init__ (e.g., 'device').
    """
    super().__init__(**kwargs)

    model_info_dict: PyLaiaModelInfo = get_pylaia_model(model, revision=revision, use_binary_lm=use_binary_lm)
    self.model_dir = model_info_dict.model_dir
    model_version = model_info_dict.model_version
    self.use_language_model = model_info_dict.use_language_model
    self.language_model_params = model_info_dict.language_model_params

    self.metadata.update(
        {
            "model": model,
            "model_version": model_version,
            "use_binary_lm": use_binary_lm,
        }
    )

    logger.info(f"Initialized PyLaiaModel from '{model}' on device '{self.device}'.")

_predict

PyLaia-specific prediction method: runs text recognition.

Parameters:

- `images` (`list[ndarray]`, required): List of images as NumPy arrays (e.g., shape [H, W, C]).
- `batch_size` (`int`, default `1`): Batch size for decoding.
- `reading_order` (`str`, default `"LTR"`): Reading order for text recognition.
- `num_workers` (`int`, default `multiprocessing.cpu_count()`): Number of workers for parallel processing.
- `temperature` (`float`, default `1.0`): Decoding temperature, forwarded to PyLaia's DecodeArgs.

Returns:

- `list[Result]`: A list of Result objects containing recognized text and optionally confidence scores.

Source code in src/htrflow/models/teklia/pylaia.py
def _predict(self, images: list[np.ndarray], **decode_kwargs) -> list[Result]:
    """
    PyLaia-specific prediction method: runs text recognition.

    Args:
        images (list[np.ndarray]):
            List of images as NumPy arrays (e.g., shape [H, W, C]).
        batch_size (int, optional):
            Batch size for decoding. Defaults to 1.
        reading_order (str, optional):
            Reading order for text recognition. Defaults to "LTR".
        num_workers (int, optional):
            Number of workers for parallel processing. Defaults to `multiprocessing.cpu_count()`.

    Returns:
        list[Result]:
            A list of Result objects containing recognized text and
            optionally confidence scores.
    """

    temperature = decode_kwargs.get("temperature", 1.0)
    batch_size = decode_kwargs.get("batch_size", 1)
    reading_order = decode_kwargs.get("reading_order", "LTR")
    num_workers = decode_kwargs.get("num_workers", multiprocessing.cpu_count())

    common_args = CommonArgs(
        checkpoint="weights.ckpt",
        train_path=str(self.model_dir),
        experiment_dirname="",
    )

    data_args = DataArgs(
        batch_size=batch_size, color_mode="L", reading_order=reading_order, num_workers=num_workers
    )

    gpus_flag = 1 if self.device.type == "cuda" else 0
    trainer_args = TrainerArgs(gpus=gpus_flag)

    decode_args = DecodeArgs(
        include_img_ids=True,
        join_string="",
        convert_spaces=True,
        print_line_confidence_scores=True,
        print_word_confidence_scores=False,
        temperature=temperature,
        use_language_model=self.use_language_model,
        **self.language_model_params.model_dump(),
    )

    # Note: PyLaia's 'decode' function expects disk-based file paths rather than in-memory data.
    # Because it is tightly integrated as a CLI tool, we must create temporary image files
    # and pass their paths to the PyLaia decoder. Otherwise, PyLaia cannot process these images.
    tmp_images_dir = Path(mkdtemp())
    logger.debug(f"Created temp folder for images: {tmp_images_dir}")

    image_ids = [str(uuid4()) for _ in images]

    for img_id, np_img in zip(image_ids, images):
        padded_img = _ensure_min_height(np_img, 128)  # Just to fix the min pixel height (defaults to 128)
        cv2.imwrite(str(tmp_images_dir / f"{img_id}.jpg"), padded_img)

    with NamedTemporaryFile() as pred_stdout, NamedTemporaryFile() as img_list:
        Path(img_list.name).write_text("\n".join(image_ids))

        with redirect_stdout(open(pred_stdout.name, mode="w")):
            decode(
                syms=str(self.model_dir / "syms.txt"),
                img_list=img_list.name,
                img_dirs=[str(tmp_images_dir)],
                common=common_args,
                data=data_args,
                trainer=trainer_args,
                decode=decode_args,
                num_workers=num_workers,
            )
            sys.stdout.flush()

        decode_output_lines = Path(pred_stdout.name).read_text().strip().splitlines()

    results = []
    metadata = self.metadata | {"decode_kwargs": decode_kwargs}

    for line in decode_output_lines:
        match = self.LINE_PREDICTION.match(line)
        if not match:
            logger.warning("Could not parse line: %s", line)
            continue
        _, score_str, text = match.groups()  # _ = image_id

        try:
            score_val = float(score_str)
        except ValueError:
            score_val = 0.0

        result = Result.text_recognition_result(metadata, [text], [score_val])
        results.append(result)

    logger.debug(f"PyLaia recognized {len(results)} lines of text.")

    return results
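
The decode kwargs read at the top of this method correspond to `generation_settings` in a pipeline, and can likewise be passed to `predict()` in direct use. A sketch (the import path is assumed from the source location):

```python
from htrflow.models.teklia.pylaia import PyLaia  # assumed import path

model = PyLaia(
    model="Teklia/pylaia-belfort",
    revision="d35f921605314afc7324310081bee55a805a0b9f",
    device="cuda",
)
results = model.predict(
    line_images,          # list of numpy arrays
    batch_size=8,
    temperature=1.0,      # decoding temperature
    reading_order="LTR",
    num_workers=4,
)
```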

Segmentation models

RTMDet

Bases: BaseModel

HTRflow adapter of OpenMMLab's RTMDet model

This model can be used for region and line segmentation. Riksarkivet provides two pre-trained RTMDet models:

- https://huggingface.co/Riksarkivet/rtmdet_lines
- https://huggingface.co/Riksarkivet/rtmdet_regions

Parameters:

- `model` (`str`, required): Path to a local .pth model weights file or to a huggingface repo which contains a .pth file, for example 'Riksarkivet/rtmdet_lines'.
- `config` (`str | None`, default `None`): Path to a local config.py file or to a huggingface repo which contains a config.py file, for example 'Riksarkivet/rtmdet_lines'.
- `revision` (`str | None`, default `None`): A specific model revision, as a commit hash of the model's huggingface repo. If None, the latest available revision is used.
- `kwargs`: Additional kwargs which are forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/openmmlab/rtmdet.py
def __init__(
    self,
    model: str,
    config: str | None = None,
    revision: str | None = None,
    **kwargs,
) -> None:
    """
    Arguments:
        model: Path to a local .pth model weights file or to a
            huggingface repo which contains a .pth file, for example
            'Riksarkivet/rtmdet_lines'.
        config: Path to a local config.py file or to a huggingface
            repo which contains a config.py file, for example
            'Riksarkivet/rtmdet_lines'.
        revision: A specific model revision, as a commit hash of the
            model's huggingface repo. If None, the latest available
            revision is used.
        kwargs: Additional kwargs which are forwarded to BaseModel's
            __init__.
    """
    super().__init__(**kwargs)

    config = config or model
    model_weights, model_config = load_mmlabs(model, config, revision)

    with SuppressOutput():
        self.model = DetInferencer(
            model=model_config,
            weights=model_weights,
            device=self.device,
            show_progress=False,
        )

    logger.info(
        "Loaded RTMDet model '%s' from %s with config %s on device %s",
        model,
        model_weights,
        model_config,
        self.device,
    )

    self.metadata.update(
        {
            "model": model,
            "model_version": commit_hash_from_path(model_weights),
            "config": config,
            "config_version": commit_hash_from_path(model_config),
        }
    )

_predict

RTMDet-specific prediction method

This method is used by predict() and should typically not be called directly.

Parameters:

- `images` (`list[NumpyImage]`, required): List of input images.
- `nms_downscale` (`float`, default `1.0`): If < 1, all masks will be downscaled by this factor before applying NMS. This leads to faster NMS at the expense of accuracy.
- `nms_threshold` (`float`, default `0.4`): Score threshold for segments to keep after NMS.
- `nms_sigma` (`float`, default `2.0`): NMS parameter that affects the score calculation.
- `**kwargs`: Additional arguments that are passed to `DetInferencer.__call__`.
Source code in src/htrflow/models/openmmlab/rtmdet.py
def _predict(
    self,
    images: list[NumpyImage],
    nms_downscale: float = 1.0,
    nms_threshold: float = 0.4,
    nms_sigma: float = 2.0,
    **kwargs,
) -> list[Result]:
    """
    RTMDet-specific prediction method

    This method is used by `predict()` and should typically not be
    called directly.

    Arguments:
        images: List of input images
        nms_downscale: If < 1, all masks will be downscaled by this factor
            before applying NMS. This leads to faster NMS at the expense of
            accuracy.
        nms_threshold: Score threshold for segments to keep after NMS.
        nms_sigma: NMS parameter that affects the score calculation.
        **kwargs: Additional arguments that are passed to DetInferencer.__call__.
    """
    batch_size = max(1, len(images))
    outputs = self.model(
        images,
        batch_size=batch_size,
        draw_pred=False,
        return_datasample=True,
        **kwargs,
    )
    results = []
    for image, output in zip(images, outputs["predictions"]):
        results.append(self._create_segmentation_result(image, output, nms_downscale, nms_threshold, nms_sigma))
    return results
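
A direct-usage sketch showing how the NMS parameters are passed through `predict()` (the import path is assumed from the source location):

```python
from htrflow.models.openmmlab.rtmdet import RTMDet  # assumed import path

model = RTMDet(model="Riksarkivet/rtmdet_lines", device="cuda")
results = model.predict(
    page_images,        # list of numpy arrays
    batch_size=4,
    nms_downscale=0.5,  # faster, slightly less accurate NMS
    nms_threshold=0.4,
    nms_sigma=2.0,
)
```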

YOLO

Bases: BaseModel

HTRflow adapter of Ultralytics' YOLO model

Example usage with the Segmentation step:

- step: Segmentation
  settings:
    model: YOLO
    model_settings:
      model: Riksarkivet/yolov9-regions-1
      revision: 7c44178d85926b4a096c55c89bf224855a201fbf
      device: cpu
    generation_settings:
      batch_size: 8

generation_settings accepts the same arguments as YOLO.predict(). See the Ultralytics documentation for a list of supported arguments.

Parameters:

- `model` (`str`, required): Path to a YOLO model. The path can be a path to a local .pt model file (for example, `my-model.pt`) or an identifier of a Huggingface repo containing a .pt model file (for example, `Riksarkivet/yolov9-regions-1`).
- `revision` (`str | None`, default `None`): Optional revision of the Huggingface repository.
- `kwargs`: Additional kwargs which are forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/ultralytics/yolo.py
def __init__(self, model: str, revision: str | None = None, **kwargs) -> None:
    """
    Arguments:
        model: Path to a YOLO model. The path can be a path to a
            local .pt model file (for example, `my-model.pt`) or an
            identifier of a Huggingface repo containing a .pt
            model file (for example, `Riksarkivet/yolov9-regions-1`).
        revision: Optional revision of the Huggingface repository.
    """
    super().__init__(**kwargs)

    model_file = load_ultralytics(model, revision)
    self.model = UltralyticsYOLO(model_file).to(self.device)

    logger.info(
        "Initialized YOLO model '%s' from %s on device %s",
        model,
        model_file,
        self.model.device,
    )

    self.metadata.update({"model": model, "model_version": commit_hash_from_path(model_file)})

_predict

Run inference.

Parameters:

- `images` (`list[ndarray]`, required): Input images.
- `use_polygons` (`bool`, default `True`): Whether to include output polygons (if available).
- `polygon_approx_level` (`float`, default `0.005`): Controls the maximum distance between the original polygon and the approximated low-resolution polygon, as a fraction of the original polygon arc length. Example: with `polygon_approx_level=0.005` and a generated polygon with arc length 100, the approximated polygon will not differ more than 0.5 units from the original.
- `**kwargs`: Keyword arguments forwarded to the inner YOLO model instance.
Source code in src/htrflow/models/ultralytics/yolo.py
def _predict(
    self, images: list[np.ndarray], use_polygons: bool = True, polygon_approx_level: float = 0.005, **kwargs
) -> list[Result]:
    """
    Run inference.

    Arguments:
        images: Input images
        use_polygons: Whether to include output polygons (if available), default True.
        polygon_approx_level: A parameter which controls the maximum distance between the original polygon
            and the approximated low-resolution polygon, as a fraction of the original polygon arc length.
            Example: With `polygon_approx_level=0.005` and a generated polygon with arc length 100, the
            approximated polygon will not differ more than 0.5 units from the original.
        **kwargs: Keyword arguments forwarded to the inner YOLO model instance.
    """
    outputs = self.model(images, stream=True, verbose=False, **kwargs)

    results = []
    for image, output in zip(images, outputs):
        polygons = bboxes = scores = class_labels = None
        if output.boxes is not None:
            bboxes = output.boxes.xyxy.int().tolist()
            scores = output.boxes.conf.tolist()
            class_labels = [output.names[label] for label in output.boxes.cls.tolist()]

        if use_polygons:
            if output.masks is None:
                logger.warning("`use_polygons` was set to True but the model did not return any polygons.")
            else:
                polygons = _simplify_polygons(output.masks.xy, polygon_approx_level)

        result = Result.segmentation_result(
            image.shape[:2],
            bboxes=bboxes,
            polygons=polygons,
            scores=scores,
            labels=class_labels,
            metadata=self.metadata,
        )
        results.append(result)
    return results
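
A direct-usage sketch (the import path is assumed from the source location; `conf` is a standard Ultralytics prediction argument forwarded to the inner model):

```python
from htrflow.models.ultralytics.yolo import YOLO  # assumed import path

model = YOLO(
    model="Riksarkivet/yolov9-regions-1",
    revision="7c44178d85926b4a096c55c89bf224855a201fbf",
    device="cpu",
)
results = model.predict(
    page_images,        # list of numpy arrays
    batch_size=8,
    use_polygons=True,  # include mask polygons if the model outputs them
    conf=0.25,          # forwarded to the Ultralytics model
)
```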

Other models

DiT

Bases: BaseModel

HTRflow adapter of DiT for image classification.

Uses huggingface's implementation of DiT. For further information about the model, see https://huggingface.co/docs/transformers/model_doc/dit.

Initialize a DiT model

Parameters:

- `model` (`str`, required): Path or name of a pretrained AutoModelForImageClassification.
- `processor` (`str | None`, default `None`): Optional path or name of a pretrained AutoImageProcessor. If not given, the given `model` is used.
- `model_kwargs` (`dict | None`, default `None`): Model initialization kwargs that are forwarded to AutoModelForImageClassification.from_pretrained().
- `processor_kwargs` (`dict | None`, default `None`): Processor initialization kwargs that are forwarded to AutoImageProcessor.from_pretrained().
- `device` (`str | None`, default `None`): Model device, forwarded to BaseModel's `__init__`.
Source code in src/htrflow/models/huggingface/dit.py
def __init__(
    self,
    model: str,
    processor: str | None = None,
    model_kwargs: dict | None = None,
    processor_kwargs: dict | None = None,
    device: str | None = None,
):
    """Initialize a DiT model

    Arguments:
        model: Path or name of pretrained AutoModelForImageClassification.
        processor: Optional path or name of a pretrained AutoImageProcessor.
            If not given, the given `model` is used.
        model_kwargs: Model initialization kwargs that are forwarded to
            AutoModelForImageClassification.from_pretrained().
        processor_kwargs: Processor initialization kwargs that are forwarded
            to AutoImageProcessor.from_pretrained().
        device: Model device, forwarded to BaseModel's __init__.
    """
    super().__init__(device)

    # Initialize model
    model_kwargs = HF_CONFIG | (model_kwargs or {})
    self.model = AutoModelForImageClassification.from_pretrained(model, **model_kwargs)
    self.model.to(self.device)
    logger.info("Initialized DiT model from %s on device %s.", model, self.device)

    # Initialize processor
    processor = processor or model
    processor_kwargs = HF_CONFIG | (processor_kwargs or {})
    self.processor = AutoImageProcessor.from_pretrained(processor, **processor_kwargs)
    logger.info(
        "Initialized DiT processor from %s. Initialization parameters: %s",
        processor,
        processor_kwargs,
    )

    self.metadata.update(
        {
            "model": model,
            "model_version": model_info(model).sha,
            "processor": processor,
            "processor_version": model_info(processor).sha,
        }
    )

_predict

Perform inference on images

Parameters:

- `images` (`list[ndarray]`, required): List of input images.
- `return_format` (`Literal["argmax", "softmax"]`, default `"softmax"`): Decides the format of the output. `"softmax"` returns the confidence scores for each class label and image (default); `"argmax"` returns the most probable class label for each image.
Source code in src/htrflow/models/huggingface/dit.py
def _predict(
    self,
    images: list[np.ndarray],
    return_format: Literal["argmax", "softmax"] = "softmax",
) -> list[Result]:
    """Perform inference on `images`

    Arguments:
        images: List of input images.
        return_format: Decides the format of the output. Options are:
            - "softmax": returns the confidence scores for each class
                label and image. Default.
            - "argmax": returns the most probable class label for each
                image.
    """
    inputs = self.processor(images, return_tensors="pt").pixel_values

    with torch.no_grad():
        batch_logits = self.model(inputs.to(self.model.device)).logits

    return [
        Result(
            metadata=self.metadata,
            data=[{"classification": self._get_labels(logits, return_format)}],
        )
        for logits in batch_logits
    ]
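
A direct-usage sketch (the import path is assumed from the source location; the checkpoint is a publicly available DiT classifier):

```python
from htrflow.models.huggingface.dit import DiT  # assumed import path

model = DiT(model="microsoft/dit-base-finetuned-rvlcdip", device="cpu")

# "argmax" returns the most probable class label per image;
# "softmax" (the default) returns scores for all class labels.
results = model.predict(images, batch_size=8, return_format="argmax")
```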