import logging
from pprint import pformat
import datalinks
from datalinks.api import DLConfig
from datalinks.links import MatchTypeConfig, ExactMatch
from datalinks.pipeline import Pipeline, ProcessUnstructured, Normalize, NormalizeModes, Validate, ValidateModes
def main():
    """Ingest an unstructured text file into DataLinks, then query it back.

    Demonstrates the full pipeline: infer a table from raw text, normalize
    it onto a fixed set of columns, validate the rows, resolve entities by
    exact match, ingest, and finally query the ingested data.
    """
    logging.basicConfig(level=logging.DEBUG)

    # Configure from the environment; namespace/object vary with each
    # example, so they are set explicitly here.
    dl_config = DLConfig.from_env()
    dl_config.namespace = "cinema"
    dl_config.objectname = "awards"
    # OR configure everything explicitly:
    # dl_config = DLConfig(
    #     host="http://localhost:9001",
    #     apikey="",  # your DataLinks API key
    #     index="tests",
    #     namespace="pg",
    #     objectname="products"
    # )

    dlapi = datalinks.api.DataLinksAPI(dl_config)
    dlapi.create_space(is_private=True)  # default

    textfile = "data/movies.txt"
    logging.info("Loading text in %s", textfile)
    # Explicit encoding so the read does not depend on the platform default.
    with open(textfile, encoding="utf-8") as f:
        source_doc = {"text": f.read()}

    steps = Pipeline(
        # Infer a table from the unstructured text.
        ProcessUnstructured(
            derive_from="text",
            helper_prompt="If you find a numeric field use only the value and omit the rest.",
            model="gpt-4.1-mini-2025-04-14",
            provider="openai"
        ),
        # Map the inferred columns onto the target schema in a single pass.
        Normalize(
            target_cols={
                "Name": "the actor/actress name",
                "Titles": "the list of notable films where the actor was in",
                "Oscars": "the number of oscars won"
            },
            mode=NormalizeModes.ALL_IN_ONE,
            model="gpt-4.1-mini-2025-04-14",
            provider="openai"
        ),
        # Row-level validation of the normalized columns.
        Validate(
            mode=ValidateModes.ROWS,
            columns=["Name", "Titles", "Oscars"],
            model="gpt-4.1-mini-2025-04-14",
            provider="openai"
        )
    )
    entity_resolution = MatchTypeConfig(ExactMatch())

    result = dlapi.ingest(
        data=[source_doc],  # supports multiple files
        inference_steps=steps,
        entity_resolution=entity_resolution,
        max_attempts=1,
        batch_size=0  # default (no file batching)
    )
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logging.info(
        "Ingestion result:"
        "\nSuccessfully ingested %d dataset(s)."
        "\nFailed %d dataset(s).",
        len(result.successful),
        len(result.failed),
    )

    # Distinct name: do not shadow the ingestion payload with the query result.
    queried = dlapi.query_data(
        model="gpt-4.1-mini-2025-04-14",
        provider="openai",
        include_metadata=True
    )
    logging.info("Ingested data:\n%s", pformat(queried))
# Script entry point: run the example only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()