rahcp-lance¶
Manage LanceDB datasets stored on HCP S3.
Quick start¶
import asyncio
from rahcp_client import HCPClient
from rahcp_lance import LanceDataset
async def main():
async with HCPClient.from_env() as client:
ds = LanceDataset(client, bucket="my-bucket", prefix="lance/")
# List tables
tables = await ds.list_tables()
print(tables)
# Create a table from data
table = await ds.create("embeddings", data=[
{"text": "hello", "vector": [0.1, 0.2, 0.3]},
{"text": "world", "vector": [0.4, 0.5, 0.6]},
])
# Get table info
info = await ds.table_info("embeddings")
print(f"{info.name}: {info.num_rows} rows")
# Ingest more data
result = await ds.ingest("embeddings", [
{"text": "foo", "vector": [0.7, 0.8, 0.9]},
])
print(f"Added {result.rows_added}, total {result.total_rows}")
asyncio.run(main())
Creating tables¶
create() requires either schema or data (or both):
import pyarrow as pa
# From data (schema inferred)
table = await ds.create("embeddings", data=[
{"text": "hello", "vector": [0.1, 0.2, 0.3]},
])
# From schema (empty table)
schema = pa.schema([
pa.field("text", pa.utf8()),
pa.field("vector", pa.list_(pa.float32(), 3)),
])
table = await ds.create("embeddings", schema=schema)
Querying¶
from rahcp_lance.query import scan, take, vector_search
from rahcp_lance.schemas import ScanParams, VectorSearchParams
# Scan with filtering and pagination
table = await ds.open("embeddings")
result = await scan(table, ScanParams(
columns=["text", "vector"],
filter="text != 'hello'",
limit=10,
offset=0,
))
# Take specific rows by index
rows = await take(table, [0, 2, 5])
# Vector similarity search (k defaults to 10)
results = await vector_search(table, VectorSearchParams(
vector=[0.1, 0.2, 0.3],
column="vector",
k=5,
))
for r in results:
print(f" distance={r.distance:.4f} data={r.data}")
Data models¶
| Model | Fields |
|---|---|
TableInfo |
name, num_rows, schema_fields: list[FieldInfo] |
FieldInfo |
name, dtype, nullable |
IngestResult |
table, rows_added, total_rows |
ScanParams |
columns, filter, limit, offset |
VectorSearchParams |
vector, column, k (default 10), filter, columns |
SearchResult |
data, distance |