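"""Voice assistant loop: record microphone audio until trailing silence
(webrtcvad), transcribe it with OpenAI, generate a reply with Groq, turn
the reply into speech with OpenAI, and play it back.

Expects OPENAI_API_KEY and GROQ_API_KEY in the environment (a .env file is
loaded if present); writes input.wav and response.mp3 to the working
directory."""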
import os
import sys
import asyncio
import wave

import pyaudio
import webrtcvad
import simpleaudio as sa
from dotenv import load_dotenv, find_dotenv
from groq import AsyncGroq
from openai import OpenAI
from pydub import AudioSegment

from api_utils import audio_to_text, chat_completion, text_to_audio
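
# Project-local helpers; signatures inferred from the call sites below
# (a hedged sketch of api_utils is included as a comment at the end of
# this file):
#   audio_to_text(path, client) -> str             speech-to-text
#   chat_completion(history, groq_client) -> str   async chat reply
#   text_to_audio(text, client, out_path) -> str   text-to-speech, returns the path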

load_dotenv(find_dotenv(), override=True)
api_key = os.environ["OPENAI_API_KEY"]
groq_api_key = os.environ["GROQ_API_KEY"]

client = OpenAI(api_key=api_key)               # speech-to-text and text-to-speech
client_groq = AsyncGroq(api_key=groq_api_key)  # chat completions

def record_wav(timeout=2, silence_threshold=1):
    """Record from the default microphone until `silence_threshold` seconds
    of silence follow speech. Returns the WAV path, or None if no speech
    starts within `timeout` seconds."""
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # 0-3; 3 is the most aggressive at filtering out non-speech

    form_1 = pyaudio.paInt16
    chans = 1
    samp_rate = 16000    # webrtcvad supports 8000, 16000, 32000, or 48000 Hz
    frame_duration = 10  # ms; webrtcvad accepts only 10, 20, or 30 ms frames
    frame_size = int(samp_rate * frame_duration / 1000)  # samples per frame
    wav_output_filename = 'input.wav'

    audio = pyaudio.PyAudio()
    stream = audio.open(format=form_1, rate=samp_rate, channels=chans,
                        input=True, frames_per_buffer=frame_size)

    frames = []
    sys.stdout.write("Listening...")
    sys.stdout.flush()

    silence_duration = 0.0
    waited = 0.0  # seconds spent waiting for speech to start
    is_speaking = False
    while True:
        # stream.read() takes a frame count, so this returns exactly one
        # 10 ms VAD frame (frame_size samples = frame_size * 2 bytes).
        data = stream.read(frame_size)
        if vad.is_speech(data, samp_rate):
            frames.append(data)
            silence_duration = 0.0
            if not is_speaking:
                sys.stdout.write("\rRecording... ")
                sys.stdout.flush()
                is_speaking = True
        elif is_speaking:
            silence_duration += frame_duration / 1000
            if silence_duration > silence_threshold:
                sys.stdout.write("\rFinished recording\n")
                sys.stdout.flush()
                break
        else:
            # No speech yet: give up after `timeout` seconds so the caller
            # can handle the no-input case instead of blocking forever.
            waited += frame_duration / 1000
            if waited > timeout:
                sys.stdout.write("\r")
                sys.stdout.flush()
                break

    stream.stop_stream()
    stream.close()
    samp_width = audio.get_sample_size(form_1)
    audio.terminate()

    if len(frames) == 0:
        return None

    with wave.open(wav_output_filename, 'wb') as wavefile:
        wavefile.setnchannels(chans)
        wavefile.setsampwidth(samp_width)
        wavefile.setframerate(samp_rate)
        wavefile.writeframes(b''.join(frames))

    return wav_output_filename

def play_audio(file_path):
    """Decode an audio file with pydub and play it synchronously.
    Non-WAV formats such as the mp3 generated below require ffmpeg
    to be installed for pydub to decode them."""
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        audio = AudioSegment.from_file(file_path)
        play_obj = sa.play_buffer(audio.raw_data, num_channels=audio.channels,
                                  bytes_per_sample=audio.sample_width,
                                  sample_rate=audio.frame_rate)
        play_obj.wait_done()
    except Exception as e:
        print(f"Error playing audio file: {e}")

async def main():
    conversation_history = [
        {"role": "system", "content": "You are an audio assistant. Use colloquial language and be concise in your responses. If the user is making casual comments, keep the response under 10 words. If the user is asking for technical, academic, or emotional support, you may respond in 60 words or so."}
    ]
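    # Note: the history grows without bound, so a very long session can
    # eventually exceed the chat model's context window.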

    while True:
        audio_file_path = record_wav()

        if audio_file_path is None:
            print("No speech detected. Skipping processing.")
            await asyncio.sleep(1)
            continue

        # Pipeline: speech -> text (OpenAI), text -> reply (Groq),
        # reply -> speech (OpenAI), then local playback.
        question = audio_to_text(audio_file_path, client)
        print(f"\033[34mUser:\033[0m {question}")

        conversation_history.append({"role": "user", "content": question})

        response_text = await chat_completion(conversation_history, client_groq)
        print("\033[32mModel:\033[0m", response_text)
        conversation_history.append({"role": "assistant", "content": response_text})

        response_audio_path = text_to_audio(response_text, client, "response.mp3")
        if response_audio_path:
            play_audio(response_audio_path)

        await asyncio.sleep(1)

if __name__ == "__main__":
    asyncio.run(main())
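
# ---------------------------------------------------------------------------
# api_utils is not shown in this diff. The sketch below is one plausible
# implementation, inferred from how the helpers are called above; the model
# names ("whisper-1", "tts-1", "llama3-8b-8192") and the voice ("alloy") are
# assumptions, not taken from this repo.
#
# def audio_to_text(path, client):
#     # Transcribe a local audio file with OpenAI's speech-to-text endpoint.
#     with open(path, "rb") as f:
#         transcript = client.audio.transcriptions.create(model="whisper-1", file=f)
#     return transcript.text
#
# async def chat_completion(history, groq_client):
#     # Ask the Groq chat endpoint for the next assistant turn.
#     resp = await groq_client.chat.completions.create(
#         messages=history, model="llama3-8b-8192")
#     return resp.choices[0].message.content
#
# def text_to_audio(text, client, out_path):
#     # Synthesize speech with OpenAI TTS and write it to out_path.
#     resp = client.audio.speech.create(model="tts-1", voice="alloy", input=text)
#     resp.stream_to_file(out_path)
#     return out_path
# ---------------------------------------------------------------------------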