Friday, February 7, 2025

AI for Transliteration (Voice to text) and text to voice along with voice cloning

 AI for Transliteration (Voice to text) and text to voice along with voice cloning

Requirement 

Question answer assist AI:

1. Listen to audio and retrieve questions using Whisper speech to text

2. Send question to AI model to get answers

3. Tell answers using converted text to speech from 2. Use voice close to use user voice

VB cable setup 

https://www.youtube.com/watch?v=GC1aLL7cPY4  mmsys.cpl to open audio settings

Implementation 
1. Listen to audio and retrieve questions using Whisper speech to text

 The python code below converts voice to text. It uses small model but with better gpu you can go large.


import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch
from datetime import datetime, timedelta
from queue import Queue
from time import sleep


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="small", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action="store_false",
                        help="Don't use the English model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real-time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=5,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    args = parser.parse_args()

    # Initialization
    phrase_time = None
    data_queue = Queue()
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    recorder.dynamic_energy_threshold = False

    # Microphone setup (Windows only)
    source = sr.Microphone(sample_rate=16000)

    # Check for GPU and set the device
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

    # Load Whisper model on the appropriate device
    #model_name = f"openai/whisper-{args.model}"
    model_name = args.model + (".en" if args.model != "large" and not args.non_english else "")
    audio_model = whisper.load_model(model_name, device=device)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    transcription = [""]

    # Adjust for ambient noise safely
    try:
        with source:
            if source.stream:
                recorder.adjust_for_ambient_noise(source)
    except Exception as e:
        print(f"Error initializing microphone: {e}")
        return

    def record_callback(_, audio: sr.AudioData) -> None:
        """Threaded callback function for audio data."""
        try:
            data = audio.get_raw_data()
            data_queue.put(data)
        except Exception as e:
            print(f"Error in record callback: {e}")

    # Background recording
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    print("Model loaded and ready to transcribe...\n")

    while True:
        try:
            now = datetime.utcnow()
            if not data_queue.empty():
                phrase_complete = False
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                phrase_time = now

                # Process audio data
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Transcribe using GPU or CPU
                result = audio_model.transcribe(audio_np, fp16=(device == "cuda:0"))
                text = result["text"].strip()

                # Update transcription
                #if phrase_complete:
                transcription.append(text)
                #else:
                #    transcription[-1] = text

                # Display transcription
                os.system("cls" if os.name == "nt" else "clear")
                for line in transcription:
                    print(line)
                print("", flush=True) #end="", 

            sleep(0.25)  # Prevent CPU overuse
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"Unexpected error: {e}")

    print("\nFinal Transcription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
    
python whisper-live.py

2. Send question to AI model to get answers
Trying QWEN AI model lets see if i can run 2 models in a GPU Nvidia GeForce GTX 1660 with 6GB VRAM on windows with intel i5 CPU  and 24 GB RAM
1. Install Ollama on windows follow their website
2. ollama run qwen2.5-coder:7b
This will download 4.7 GB model if not existing and run it.