AI for Transcription (voice to text) and text to speech, along with voice cloning
Requirement
Question-answering assistant AI:
1. Listen to audio and retrieve questions using Whisper speech-to-text
2. Send each question to an AI model to get an answer
3. Speak the answer using text-to-speech on the output of step 2, with a voice close to the user's own voice (a minimal sketch of this pipeline follows below)
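Before the detailed steps, here is a minimal sketch of how the three pieces could be glued together. The helpers transcribe_question and ask_model are hypothetical placeholders for the components built in the Implementation section, and pyttsx3 is only a plain offline TTS stand-in; it does not do voice cloning.
import pyttsx3  # plain offline TTS, used here as a stand-in for voice cloning

def transcribe_question() -> str:
    # Placeholder for step 1: Whisper speech-to-text (implemented below).
    raise NotImplementedError

def ask_model(question: str) -> str:
    # Placeholder for step 2: local LLM via Ollama (implemented below).
    raise NotImplementedError

def speak_answer(text: str) -> None:
    # Step 3: speak the answer. Real voice cloning would need a TTS model
    # conditioned on samples of the user's voice.
    engine = pyttsx3.init()
    engine.say(text)
    engine.runAndWait()

def main() -> None:
    while True:
        speak_answer(ask_model(transcribe_question()))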
VB cable setup
See https://www.youtube.com/watch?v=GC1aLL7cPY4 for the walkthrough. Run mmsys.cpl to open the Windows sound settings.
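To confirm the virtual cable is visible from Python, speech_recognition can list every input device; the printed index can then be passed as sr.Microphone(device_index=...). A quick check, assuming speech_recognition and PyAudio are installed:
import speech_recognition as sr

# Print all input devices; look for the VB-Audio CABLE entry to
# route the meeting/loopback audio into the transcriber.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")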
Implementation
1. Listen to audio and retrieve questions using Whisper speech-to-text
The Python code below converts voice to text. It uses the small model by default, but with a better GPU you can go up to the large one.
import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="small", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action="store_true",
                        help="Don't use the English model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real-time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=5,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    args = parser.parse_args()

    # Initialization
    phrase_time = None
    data_queue = Queue()
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Disable dynamic thresholding so the recognizer does not keep
    # lowering the threshold until everything counts as speech.
    recorder.dynamic_energy_threshold = False

    # Microphone setup (default input device)
    source = sr.Microphone(sample_rate=16000)

    # Check for GPU and set the device
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

    # Load the Whisper model on the appropriate device. English-only
    # variants (e.g. "small.en") exist for every size except "large".
    model_name = args.model + (".en" if args.model != "large" and not args.non_english else "")
    audio_model = whisper.load_model(model_name, device=device)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    transcription = [""]

    # Adjust for ambient noise safely
    try:
        with source:
            recorder.adjust_for_ambient_noise(source)
    except Exception as e:
        print(f"Error initializing microphone: {e}")
        return

    def record_callback(_, audio: sr.AudioData) -> None:
        """Threaded callback that receives raw audio chunks."""
        try:
            data = audio.get_raw_data()
            data_queue.put(data)
        except Exception as e:
            print(f"Error in record callback: {e}")

    # Background recording
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
    print("Model loaded and ready to transcribe...\n")

    while True:
        try:
            now = datetime.utcnow()
            if not data_queue.empty():
                # If enough silence has passed since the last chunk,
                # treat this as the start of a new phrase.
                phrase_complete = False
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                phrase_time = now

                # Drain the queue and convert 16-bit PCM to float32 in [-1, 1]
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Transcribe using GPU or CPU
                result = audio_model.transcribe(audio_np, fp16=(device == "cuda:0"))
                text = result["text"].strip()

                # Start a new line on a completed phrase, otherwise
                # keep refining the current line.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Display transcription
                os.system("cls" if os.name == "nt" else "clear")
                for line in transcription:
                    print(line)
                print("", flush=True)

            sleep(0.25)  # Prevent CPU overuse
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"Unexpected error: {e}")

    print("\nFinal Transcription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
Save the script as whisper-live.py and run:
python whisper-live.py
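The argparse flags defined above let you trade accuracy for latency, for example:
python whisper-live.py --model medium --record_timeout 2 --phrase_timeout 5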
2. Send question to AI model to get answers
Trying the Qwen AI model. Let's see if I can run two models on an NVIDIA GeForce GTX 1660 GPU with 6 GB of VRAM, on Windows with an Intel i5 CPU and 24 GB of RAM.
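Before loading a second model, it is worth checking how much VRAM Whisper already occupies. A quick check with PyTorch (torch.cuda.mem_get_info reports free and total device memory in bytes on recent PyTorch releases):
import torch

# Report free vs. total VRAM on the current CUDA device.
if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"Free VRAM: {free / 1024**3:.2f} GiB of {total / 1024**3:.2f} GiB")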
1. Install Ollama on Windows by following the instructions on their website.
2. ollama run qwen2.5-coder:7b
This downloads the 4.7 GB model if it is not already present and runs it.
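To wire this into the pipeline, the transcribed question can be POSTed to Ollama's local REST API (it listens on http://localhost:11434 by default). A minimal sketch using the requests library, with the model name matching the one pulled above:
import requests

def ask_model(question: str) -> str:
    # Ollama's generate endpoint; stream=False returns one JSON object.
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "qwen2.5-coder:7b",
            "prompt": question,
            "stream": False,
        },
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["response"]

print(ask_model("What is a Python decorator?"))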