From a4dd5f87c0c74cda79554d6a551e750fca0114df Mon Sep 17 00:00:00 2001
From: Rohit Rajan
Date: Fri, 24 Apr 2026 13:36:07 +0530
Subject: [PATCH] feat: add support for document extraction

---
 client.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/client.py b/client.py
index 3d371cb..2daa8d9 100644
--- a/client.py
+++ b/client.py
@@ -1,7 +1,9 @@
 import time
+import asyncio
+import os
 import httpx
 from datetime import datetime, timezone
-from typing import Optional
+from typing import Optional, Union
 
 from .types import Config, MaxunError
 
@@ -161,6 +163,71 @@ async def extract_with_llm(self, options: dict):
             self.client.post("/extract/llm", json=options, timeout=300)
         )
 
+    async def create_document_robot(
+        self,
+        file: Union[str, os.PathLike, bytes],
+        prompt: str,
+        robot_name: Optional[str] = None,
+        ollama_model: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> dict:
+        """Create a document-extraction robot from a PDF.
+
+        Args:
+            file: Path of a PDF on disk (``str`` or ``os.PathLike``), or the
+                PDF content itself as ``bytes``.
+            prompt: Sent to the server as the ``prompt`` form field.
+            robot_name: Sent as ``robotName`` when truthy.
+            ollama_model: Sent as ``ollamaModel`` when truthy.
+            file_name: Name reported for the upload. Defaults to the basename
+                of ``file`` for paths and to ``document.pdf`` for bytes.
+
+        Returns:
+            The decoded JSON response body.
+
+        Raises:
+            OSError: If ``file`` is a path that cannot be opened or read.
+            httpx.HTTPStatusError: Propagated from ``raise_for_status()`` on
+                an error status (assumes ``self.client`` is an httpx client).
+            MaxunError: If the body is not a JSON object, or has neither a
+                truthy ``data`` nor a truthy ``robot`` entry.
+        """
+        if isinstance(file, (str, os.PathLike)):
+            path = os.fsdecode(file)
+            file_name = file_name or os.path.basename(path)
+
+            def read_file() -> bytes:
+                with open(path, "rb") as f:
+                    return f.read()
+
+            # Disk reads block; run them on the default executor so the event
+            # loop keeps serving other tasks while the PDF is loaded.
+            loop = asyncio.get_running_loop()
+            file_bytes = await loop.run_in_executor(None, read_file)
+        else:
+            file_bytes = file
+            file_name = file_name or "document.pdf"
+
+        data = {"prompt": prompt}
+        if robot_name:
+            data["robotName"] = robot_name
+        if ollama_model:
+            data["ollamaModel"] = ollama_model
+
+        response = await self.client.post(
+            "/robots/document",
+            files={"file": (file_name, file_bytes, "application/pdf")},
+            data=data,
+            timeout=120,
+        )
+        response.raise_for_status()
+        body = response.json()
+        # A JSON array or scalar has no .get(); report it as a failed creation
+        # instead of leaking an AttributeError.
+        if not isinstance(body, dict) or not (body.get("data") or body.get("robot")):
+            raise MaxunError("Failed to create document robot")
+        return body
+
     async def create_crawl_robot(self, url: str, options: dict):
         return await self._handle(
             self.client.post("/crawl", json={"url": url, **options})