Skip to content

rahcp-lance

Manage LanceDB datasets stored on HCP S3.

Quick start

import asyncio
from rahcp_client import HCPClient
from rahcp_lance import LanceDataset

async def main():
    """Walk through the basic LanceDataset workflow end to end.

    Lists the existing tables, creates an ``embeddings`` table from two
    rows, prints its name and row count, then ingests one more row and
    prints how many rows were added and the new total.
    """
    # Rows used to create the table; the schema is inferred from them.
    initial_rows = [
        {"text": "hello", "vector": [0.1, 0.2, 0.3]},
        {"text": "world", "vector": [0.4, 0.5, 0.6]},
    ]
    # Rows appended afterwards via ingest().
    extra_rows = [
        {"text": "foo", "vector": [0.7, 0.8, 0.9]},
    ]

    async with HCPClient.from_env() as client:
        ds = LanceDataset(client, bucket="my-bucket", prefix="lance/")

        # List tables
        existing_tables = await ds.list_tables()
        print(existing_tables)

        # Create a table from data (the returned table handle is unused here)
        table = await ds.create("embeddings", data=initial_rows)

        # Get table info
        info = await ds.table_info("embeddings")
        print(f"{info.name}: {info.num_rows} rows")

        # Ingest more data
        result = await ds.ingest("embeddings", extra_rows)
        print(f"Added {result.rows_added}, total {result.total_rows}")

asyncio.run(main())

Creating tables

create() requires at least one of schema or data (you may pass both):

import pyarrow as pa

# NOTE: these snippets use `await`, so they must run inside an async
# function. `ds` is presumably the LanceDataset from the Quick start —
# confirm when copying this code elsewhere.
# The two calls below show alternative ways to call create(); both use the
# table name "embeddings".

# From data (schema inferred)
table = await ds.create("embeddings", data=[
    {"text": "hello", "vector": [0.1, 0.2, 0.3]},
])

# From schema (empty table)
schema = pa.schema([
    pa.field("text", pa.utf8()),
    # Passing a list size (3) to pa.list_ declares a fixed-size list of float32.
    pa.field("vector", pa.list_(pa.float32(), 3)),
])
table = await ds.create("embeddings", schema=schema)

Querying

from rahcp_lance.query import scan, take, vector_search
from rahcp_lance.schemas import ScanParams, VectorSearchParams

# NOTE: these snippets use `await`, so they must run inside an async
# function. `ds` is presumably the LanceDataset from the Quick start —
# confirm when copying this code elsewhere.

# Scan with filtering and pagination
table = await ds.open("embeddings")
result = await scan(table, ScanParams(
    columns=["text", "vector"],
    filter="text != 'hello'",
    limit=10,
    offset=0,
))

# Take specific rows by index
rows = await take(table, [0, 2, 5])

# Vector similarity search (k defaults to 10; here it is set to 5 explicitly)
results = await vector_search(table, VectorSearchParams(
    vector=[0.1, 0.2, 0.3],
    column="vector",
    k=5,
))
# Each result exposes `distance` and `data`.
for r in results:
    print(f"  distance={r.distance:.4f}  data={r.data}")

Data models

| Model | Fields |
| --- | --- |
| TableInfo | name, num_rows, schema_fields: list[FieldInfo] |
| FieldInfo | name, dtype, nullable |
| IngestResult | table, rows_added, total_rows |
| ScanParams | columns, filter, limit, offset |
| VectorSearchParams | vector, column, k (default 10), filter, columns |
| SearchResult | data, distance |