Multimodal Generations

zyx provides simple functions for multimodal generation. These functions are also available as LLM-compatible tools, enabling multimodal tool-calling agents.

Image Generation

Generate images through either the OpenAI or FAL_AI APIs.

from zyx import image

image("An astronaut riding a rainbow unicorn")

Generates an image using either the FAL_AI API or OpenAI, with an optional display function to show the image in a notebook.

Parameters:

Name	Type	Description	Default
`prompt`	`str`	str,	required
`model`	`ModelType`	ModelType = "dall-e-3",	`'dall-e-3'`
`api_key`	`Optional[str]`	Optional[str] = None,	`None`
`image_size`	`Optional[str]`	Optional[str] = "landscape_4_3",	`'landscape_4_3'`
`num_inference_steps`	`Optional[int]`	Optional[int] = 26,	`26`
`guidance_scale`	`Optional[float]`	Optional[float] = 3.5,	`3.5`
`enable_safety_checker`	`Optional[bool]`	Optional[bool] = False,	`False`
`size`	`Optional[str]`	Optional[str] = "1024x1024",	`'1024x1024'`
`quality`	`Optional[str]`	Optional[str] = "standard",	`'standard'`
`n`	`Optional[int]`	Optional[int] = 1,	`1`
`display`	`Optional[bool]`	Optional[bool] = False,	`False`

Returns:

Type	Description
`Union[str, Any]`	str or Any: The generated image or an error message.

Source code in zyx/resources/ext/multimodal.py

def _show_image_url(url: str) -> bool:
    """Render an image URL inline via IPython, if IPython is importable.

    Parameters:
        url: Location of the image to render.

    Returns:
        True when the image was handed to IPython's ``display``; False when
        IPython could not be imported (the problem is logged and the install
        helper is invoked, but nothing is rendered).
    """
    try:
        # Aliased on purpose: importing ``display`` under its own name inside
        # ``image`` would rebind that function's boolean ``display`` flag.
        from IPython.display import Image as IPythonImage
        from IPython.display import display as ipython_display
    except ImportError:
        from ... import logger

        logger.critical(
            "The display function requires IPython, which is not included in the base 'zyx' package. Please install it with `pip install ipython`."
        )
        # NOTE(review): prompt_cli_install is defined elsewhere; presumably it
        # offers to install the package. Either way the names above are still
        # unbound here, so skip rendering instead of crashing the caller.
        prompt_cli_install("IPython")
        return False

    ipython_display(IPythonImage(url=url))
    return True


def image(
    prompt: str,
    model: ModelType = "dall-e-3",
    api_key: Optional[str] = None,
    image_size: Optional[str] = "landscape_4_3",
    num_inference_steps: Optional[int] = 26,
    guidance_scale: Optional[float] = 3.5,
    enable_safety_checker: Optional[bool] = False,
    size: Optional[str] = "1024x1024",
    quality: Optional[str] = "standard",
    n: Optional[int] = 1,
    display: Optional[bool] = False,
    optimize_prompt: Optional[bool] = False,
    optimize_prompt_model: Optional[str] = "openai/gpt-4o-mini",
) -> Union[str, Any]:
    """Generates an image using either the FAL_AI API or OpenAI, optionally
    displaying the first generated image in a notebook.

    The provider is chosen from the configuration that ``_get_model_config``
    returns for ``model``. Errors raised while talking to a provider are
    returned to the caller rather than raised.

    Parameters:
        prompt: Text description of the image to generate.
        model: Model identifier; resolved to a provider via ``_get_model_config``.
        api_key: API key passed to the OpenAI client (OpenAI provider only).
        image_size: ``image_size`` argument sent to FAL (FAL provider only).
        num_inference_steps: ``num_inference_steps`` argument sent to FAL.
        guidance_scale: ``guidance_scale`` argument sent to FAL.
        enable_safety_checker: ``enable_safety_checker`` argument sent to FAL.
        size: ``size`` argument sent to OpenAI (OpenAI provider only).
        quality: ``quality`` argument sent to OpenAI (OpenAI provider only).
        n: Number of images requested (``n`` for OpenAI, ``num_images`` for FAL).
        display: If True, show the first generated image through IPython.
            When IPython is unavailable the image is simply not shown; the
            generation result is still returned.
        optimize_prompt: If True, rewrite ``prompt`` with an LLM before
            generation. Only applied for the FAL provider.
        optimize_prompt_model: Model used for the prompt rewrite.

    Returns:
        The provider's response object (OpenAI) or result mapping (FAL) on
        success, or the caught exception object on failure. Falls through and
        returns None when the provider is neither "openai" nor "fal".
    """
    model_config = _get_model_config(model)
    provider = model_config["provider"]

    if provider == "openai":
        from openai import OpenAI

        try:
            client = OpenAI(api_key=api_key)
            response = client.images.generate(
                model=model_config["model"],
                prompt=prompt,
                size=size,
                quality=quality,
                n=n,
            )
        except Exception as e:
            return e
        if display:
            _show_image_url(response.data[0].url)
        return response

    elif provider == "fal":
        try:
            import fal_client
        except ImportError as e:
            from ... import logger

            logger.critical(
                "The FAL_AI API requires the 'fal-client' package. Please install it with `pip install fal-client`."
            )
            prompt_cli_install("fal-client")
            # Without the client nothing below can succeed; return the error
            # now instead of first spending an LLM call on prompt optimization.
            return e

        if optimize_prompt:
            from ... import completion
            from pydantic import BaseModel

            class OptimizedPrompt(BaseModel):
                prompt: str

            optimized_prompt = completion(
                messages=[
                    {
                        "role": "system",
                        "content": f"""
                 ## CONTEXT ## \n
                 You are a world class image description optimizer. You enhance descriptions of images at incredible quality, detail, but with a focus on being concise. You define your descritpions
                 in a comma list of 2-3 word phrases. \n\n

                 ## INSTRUCTIONS ## \n
                 - You will be given a description of an image
                 - Reason about the image as a whole, and descriptions the user has provided
                 - Optimize the prompt for use in image generation
                 - Ensure that the optimized prompt is a concise, detailed list of 2-3 word phrases.

                 ## EXAMPLE ## \n
                 Original Prompt : [ A beautiful landscape painting of a sunset over the ocean. ] \n
                 Optimized Prompt : [ A beautiful painting, pink vibrant sunset, dynamic ocean waves, vibrant art, 4k, brush strokes ]

                 """,
                    },
                    {
                        "role": "user",
                        "content": f"Optimize this image description for use in image generation. The original prompt is : [ {prompt} ]",
                    },
                ],
                model=optimize_prompt_model,
                response_model=OptimizedPrompt,
            )

            prompt = optimized_prompt.prompt

        try:
            handler = fal_client.submit(
                application=model_config["application"],
                arguments={
                    "prompt": prompt,
                    "image_size": image_size,
                    "num_inference_steps": num_inference_steps,
                    "guidance_scale": guidance_scale,
                    "enable_safety_checker": enable_safety_checker,
                    "num_images": n,
                },
            )
            result = handler.get()
            if display:
                # A missing IPython no longer replaces ``result`` with an
                # exception; other display-time errors are still returned.
                _show_image_url(result["images"][0]["url"])
        except Exception as e:
            result = e
        return result

Audio Generation

Use the audio() function to generate audio. This is direct text-to-speech conversion.

from zyx import audio

audio("Hello, my name is john!")

API Reference

Generates an audio file from text, through the openai API.

Parameters:

Name	Type	Description	Default
`prompt`	`str`	str,	required
`model`	`OPENAI_TTS_MODELS`	OPENAI_TTS_MODELS = "tts-1",	`'tts-1'`
`voice`	`OPENAI_TTS_VOICES`	OPENAI_TTS_VOICES = "alloy",	`'alloy'`
`api_key`	`Optional[str]`	Optional[str] = None,	`None`
`base_url`	`Optional[str]`	Optional[str] = None,	`None`
`filename`	`Optional[str]`	Optional[str] = None,	`None`
`play`	`bool`	bool = False,	`False`

Returns:

Type	Description
	str or Any: The generated audio file or an error message.

Source code in zyx/resources/ext/multimodal.py

def audio(
    prompt: str,
    model: OPENAI_TTS_MODELS = "tts-1",
    voice: OPENAI_TTS_VOICES = "alloy",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    filename: Optional[str] = None,
    play: bool = False,
):
    """Generates audio from text through the OpenAI API.

    Parameters:
        prompt: Text to convert to speech.
        model: Text-to-speech model passed to the API.
        voice: Voice passed to the API.
        api_key: API key passed to the OpenAI client.
        base_url: Base URL passed to the OpenAI client.
        filename: If given, the decoded audio is also written to this path.
            Must end with ``.wav``, ``.mp3`` or ``.m4a``.
        play: If True, play the audio through sounddevice and return an
            IPython ``Audio`` widget (or the raw samples when IPython is not
            installed).

    Returns:
        ``(audio_array, sample_rate)`` when ``play`` is False; an IPython
        ``Audio`` widget when ``play`` is True and IPython is available;
        otherwise an error message string. A failure while decoding the
        response returns the exception object itself rather than a string.
    """
    # Check the filename before any import or network work so that a bad
    # extension does not cost a text-to-speech request. The message is
    # returned, not raised, to match the error-message contract of this
    # function.
    if filename:
        file_endings = [".wav", ".mp3", ".m4a"]
        if not filename.endswith(tuple(file_endings)):
            return f"Filename must end with one of the following: {', '.join(file_endings)}"

    from openai import OpenAI
    import io

    try:
        import sounddevice as sd
        import soundfile as sf
    except ImportError:
        from ... import logger

        logger.critical(
            "The [italic]speak[/italic] function requires sounddevice and soundfile, which are not included in the base 'zyx' package. Please install them with [bold]`pip install sounddevice soundfile`[/bold]."
        )
        # NOTE(review): prompt_cli_install is defined elsewhere; `sd`/`sf`
        # stay unbound in this call even if it installs the packages.
        prompt_cli_install("sounddevice soundfile")

    client = OpenAI(api_key=api_key, base_url=base_url)
    try:
        response = client.audio.speech.create(input=prompt, model=model, voice=voice)
        audio_data = response.read()

        try:
            with io.BytesIO(audio_data) as audio_buffer:
                audio_array, sample_rate = sf.read(audio_buffer)
        except Exception as e:
            return e

        if filename:
            sf.write(filename, audio_array, sample_rate)

        if not play:
            return audio_array, sample_rate

        try:
            from IPython.display import Audio
        except ImportError:
            from ... import logger

            logger.critical(
                "The [italic]play[/italic] function requires IPython, which is not included in the base 'zyx' package. Please install it with [bold]`pip install ipython`[/bold]."
            )
            prompt_cli_install("IPython")
            Audio = None

        # Play audio using sounddevice
        sd.play(audio_array, sample_rate)
        sd.wait()

        if Audio is None:
            # Playback already succeeded; return the samples instead of
            # failing on the missing notebook widget.
            return audio_array, sample_rate

        # For Jupyter notebook, also return IPython audio widget
        return Audio(audio_array, rate=sample_rate, autoplay=True)

    except Exception as e:
        return str(e)

Audio Transcription

Use the transcribe() function to convert audio files into text.

from zyx import transcribe

transcribe(file="path/to/audio.mp3")

API Reference

Transcribes an audio file into text, through the openai API.

Parameters:

Name	Type	Description	Default
`model`	`str`	str = "whisper-1",	`'whisper-1'`
`api_key`	`Optional[str]`	Optional[str] = None,	`None`
`base_url`	`Optional[str]`	Optional[str] = None,	`None`
`organization`	`Optional[str]`	Optional[str] = None,	`None`
`file`	`Optional[str]`	Optional[str] = None,	`None`
`record`	`bool`	bool = False,	`False`
`duration`	`int`	int = 5,	`5`

Returns:

Type	Description
	str or Any: The transcribed text or an error message.

Source code in zyx/resources/ext/multimodal.py

def transcribe(
    model: str = "whisper-1",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    organization: Optional[str] = None,
    file: Optional[str] = None,
    record: bool = False,
    duration: int = 5,
):
    """Transcribes an audio file or a fresh recording into text through the
    OpenAI API.

    Parameters:
        model: Transcription model passed to the API.
        api_key: API key passed to the OpenAI client.
        base_url: Base URL passed to the OpenAI client.
        organization: Organization passed to the OpenAI client.
        file: Path to a ``.mp3``, ``.wav`` or ``.m4a`` file. Ignored when
            ``record`` is True.
        record: If True, record from the microphone via sounddevice instead
            of reading ``file``.
        duration: Length of the recording in seconds (only used with
            ``record``).

    Returns:
        The transcription returned by the API, or the error message string if
        the API call raised.

    Raises:
        ValueError: If neither ``file`` nor ``record`` is given, or ``file``
            does not end with a supported extension.
        ImportError: If recording is requested and sounddevice/soundfile
            still cannot be imported after the install helper has run.
    """
    # Validate first: these checks need no client, no audio packages and no
    # recording, so bad arguments fail immediately.
    if not record:
        if not file:
            raise ValueError(
                "Either 'file' must be provided or 'record' must be set to True"
            )
        if not file.endswith((".mp3", ".wav", ".m4a")):
            raise ValueError("File must be a .mp3, .wav, or .m4a file")

    from openai import OpenAI
    import io

    client = OpenAI(api_key=api_key, base_url=base_url, organization=organization)

    if record:
        # The audio packages are only needed to capture from the microphone;
        # transcribing an existing file does not require them.
        try:
            import sounddevice as sd
            import soundfile as sf
        except ImportError:
            import importlib

            from ... import logger

            logger.critical(
                "The [italic]transcribe[/italic] function requires sounddevice and soundfile, which are not included in the base 'zyx' package. Please install them with [bold]`pip install sounddevice soundfile`[/bold]."
            )
            # NOTE(review): prompt_cli_install is defined elsewhere;
            # presumably it offers to install the packages. Retry once so a
            # successful install is picked up, and let a second failure
            # surface as ImportError rather than an unbound-name error later.
            prompt_cli_install("sounddevice soundfile")
            importlib.invalidate_caches()
            import sounddevice as sd
            import soundfile as sf

        sample_rate = 44100
        print(f"Recording for {duration} seconds...")
        audio_data = sd.rec(
            int(duration * sample_rate), samplerate=sample_rate, channels=1
        )
        sd.wait()
        print("Recording finished.")

        # The buffer must stay open until the upload finishes, so it is
        # closed in the ``finally`` below instead of by a ``with`` block.
        audio_file = io.BytesIO()
        sf.write(audio_file, audio_data, sample_rate, format="wav")
        audio_file.seek(0)
        # Give the in-memory upload a filename with an extension so the
        # audio format can be identified.
        audio_file.name = "recording.wav"
    else:
        audio_file = open(file, "rb")

    try:
        transcription = client.audio.transcriptions.create(
            model=model, file=audio_file, response_format="text"
        )
        return transcription
    except Exception as e:
        return str(e)
    finally:
        # Release the file handle / buffer on both success and failure.
        audio_file.close()