Document AI Utilities

zyx provides a couple of utilities that make working with documents & long text a little easier.


Chunking

Use the chunk function for quick semantic chunking, with optional parallelization.

from zyx import chunk

chunk("Hello, world!")
# ["Hello, world!"]

API Reference

Takes a string, Document, or a list of strings/Document models and returns the chunked content.

Example
chunk("Hello, world!")
# ["Hello, world!"]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| inputs | Union[str, Document, List[Union[str, Document]]] | The input to chunk. | required |
| chunk_size | int | The size of the chunks to return, in tokens. | 512 |
| model | str | The model whose tokenizer is used to count tokens. | 'gpt-4' |
| processes | int | The number of processes to use for chunking. | 1 |
| memoize | bool | Whether to memoize the chunking process. | True |
| progress | bool | Whether to show a progress bar. | False |
| max_token_chars | Optional[int] | The maximum number of characters a token may contain; used to speed up token counting of long inputs. | None |

Returns:

| Type | Description |
| --- | --- |
| Union[List[str], List[List[str]]] | The chunked content: a flat list of chunks for a single input, or one list of chunks per input for a list. |

Source code in zyx/resources/data/chunk.py
# Imports used by this function (tiktoken and semchunk are third-party
# libraries; Document is zyx's own document model, defined elsewhere in the package)
import semchunk
import tiktoken
from typing import List, Optional, Union

def chunk(
    inputs: Union[str, Document, List[Union[str, Document]]],
    chunk_size: int = 512,
    model: str = "gpt-4",
    processes: int = 1,
    memoize: bool = True,
    progress: bool = False,
    max_token_chars: Optional[int] = None,
) -> Union[List[str], List[List[str]]]:
    """
    Takes a string, Document, or a list of strings/Document models and returns the chunked content.

    Example:
        ```python
        chunk("Hello, world!")
        # ["Hello, world!"]
        ```

    Args:
        inputs: Union[str, Document, List[Union[str, Document]]]: The input to chunk.
        chunk_size: int: The size of the chunks to return.
        model: str: The model to use for chunking.
        processes: int: The number of processes to use for chunking.
        memoize: bool: Whether to memoize the chunking process.
        progress: bool: Whether to show a progress bar.
        max_token_chars: Optional[int]: The maximum number of characters a token may contain; speeds up token counting of long inputs.

    Returns:
        Union[List[str], List[List[str]]]: The chunked content.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
        chunker = semchunk.chunkerify(
            tokenizer,
            chunk_size=chunk_size,
            max_token_chars=max_token_chars,
            memoize=memoize,
        )

        # Handle single input case (str or Document)
        if isinstance(inputs, (str, Document)):
            inputs = [inputs]  # Convert to list for uniform handling

        if not isinstance(inputs, list):
            raise TypeError(
                "inputs must be a string, Document, or a list of strings/Documents"
            )

        texts = []
        for item in inputs:
            # Handle Document content
            if isinstance(item, Document):
                content = item.content
                # Convert non-string content (e.g., lists from CSV/XLSX) to string
                if isinstance(content, list):
                    content = "\n".join([" | ".join(map(str, row)) for row in content])
                elif not isinstance(content, str):
                    raise TypeError(
                        f"Document content must be a string or list of strings, found {type(content)}"
                    )
                texts.append(content)
            # Handle string input directly
            elif isinstance(item, str):
                texts.append(item)
            else:
                raise TypeError(f"Unsupported input type: {type(item)}")

        # Chunk the content, using processes and progress bar as needed
        if len(texts) == 1:
            return chunker(texts[0])  # Single input, return the chunked result
        else:
            return chunker(
                texts, processes=processes, progress=progress
            )  # Multiple inputs

    except Exception as e:
        # Log the error, then re-raise with the original traceback intact
        print(f"Error in chunk function: {str(e)}")
        raise
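
Note that the return shape depends on the input: a single string or Document comes back as a flat list of chunks, while a list of inputs comes back as one chunk list per input. A quick illustration (the input strings are placeholders):

from zyx import chunk

single = chunk("some long text ...")                 # List[str]
batch = chunk(["first doc ...", "second doc ..."])   # List[List[str]]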

Reading

Use the read function to quickly read most document types from both the local file system & the web. It can ingest many documents at once and return a list of Document models.

from zyx import read

read("path/to/file.pdf")
# Document(content="...", metadata={"file_name": "file.pdf", "file_type": "application/pdf", "file_size": 123456})
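
read also accepts directories, lists of paths, and URLs (remote files are downloaded to a temporary location and cleaned up afterwards). A short sketch, assuming the paths and URL below exist:

from zyx import read

# Read every file in a directory, in parallel
docs = read("path/to/docs/", workers=4)   # List[Document]

# Mix local files and URLs in a single call
docs = read(["notes.md", "https://example.com/paper.pdf"])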

API Reference

Reads either a file, a directory, or a list of files and returns the content.

Example
read("path/to/file.pdf")
# Document(content="...", metadata={"file_name": "file.pdf", "file_type": "application/pdf", "file_size": 123456})

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| path | Union[str, Path, List[Union[str, Path]]] | The path(s) or URL(s) to read. | required |
| output | Union[Type[str], OutputFormat] | The output format. | 'document' |
| target | OutputType | The output type. | 'text' |
| verbose | bool | Whether to print verbose output. | False |
| workers | Optional[int] | The number of worker threads to use for reading. | None |

Returns:

| Type | Description |
| --- | --- |
| Union[Document, List[Document], str] | The content. |

Source code in zyx/resources/data/reader.py
# Imports used by this function (OutputFormat, OutputType, Document, logger, and the
# _download_if_url/_read_single_file helpers are defined elsewhere in the module)
import multiprocessing as mp
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import List, Optional, Type, Union

def read(
    path: Union[str, Path, List[Union[str, Path]]],
    output: Union[Type[str], OutputFormat] = "document",
    target: OutputType = "text",
    verbose: bool = False,
    workers: Optional[int] = None,
) -> Union[Document, List[Document], str]:
    """
    Reads either a file, a directory, or a list of files and returns the content.

    Example:
        ```python
        read("path/to/file.pdf")
        # Document(content="...", metadata={"file_name": "file.pdf", "file_type": "application/pdf", "file_size": 123456})
        ```

    Args:
        path: Union[str, Path, List[Union[str, Path]]]: The path to read.
        output: Union[Type[str], OutputFormat]: The output format.
        target: OutputType: The output type.
        verbose: bool: Whether to print verbose output.
        workers: Optional[int]: The number of workers to use for reading.

    Returns:
        Union[Document, List[Document], str]: The content.
    """
    if isinstance(path, list):
        paths = [_download_if_url(p) for p in path]
    else:
        paths = [_download_if_url(path)]

    paths = [Path(p) for p in paths]

    try:
        if len(paths) == 1 and paths[0].is_file():
            return _read_single_file(
                path=paths[0], output=output, target=target, verbose=verbose
            )
        else:
            with ThreadPoolExecutor(max_workers=workers or mp.cpu_count()) as executor:
                futures = [
                    executor.submit(_read_single_file, file, output, target, verbose)
                    for p in paths
                    for file in (p.glob("*") if p.is_dir() else [p])
                    if file.is_file()
                ]
                results = [future.result() for future in futures]
            return [result for result in results if result is not None]
    finally:
        # Cleanup temporary files
        for p in paths:
            if str(p).startswith("/tmp/") and p.is_file():
                try:
                    os.remove(p)
                except Exception as e:
                    if verbose:
                        logger.error(f"Error removing temporary file {p}: {str(e)}")
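
Since read returns Document models and chunk accepts them directly, the two utilities compose into a simple ingestion pipeline. A minimal sketch (the directory path is hypothetical):

from zyx import chunk, read

docs = read("path/to/docs/")   # List[Document]

chunks = chunk(
    docs,                      # chunk accepts Documents directly
    chunk_size=512,
    processes=2,
    progress=True,
)
# one List[str] of chunks per document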