
Commit aa97bc2 (0 parents)

first_github_upload

File tree

6 files changed: +194 additions, -0 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

```gitignore
*.mp3
*.wav
.env
__pycache__/
```

README.md

Lines changed: 17 additions & 0 deletions

## Voice Chat with LLM

This code starts a voice chat with an LLM. Voice activity detection (VAD) is used to detect the start and end of speech. The language and voice models used are:

1. OpenAI Whisper for audio-to-text transcription;
2. Llama3-8B served through the Groq API for fast responses;
3. OpenAI TTS for text-to-speech.

![demo](./demo.png)

## Deployment

1. Create a `.env` file:

```.env
OPENAI_API_KEY=
GROQ_API_KEY=
```

2. Run the following commands:

```sh
pip install -r requirements.txt
python app.py
```
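
Before running `app.py`, it can help to verify that step 1 took effect. A minimal sanity check, illustrative only and not part of the repo:

```python
import os
from dotenv import load_dotenv, find_dotenv

# Load the .env file created in step 1 and report which keys are present.
load_dotenv(find_dotenv())
for key in ("OPENAI_API_KEY", "GROQ_API_KEY"):
    print(key, "is set" if os.environ.get(key) else "is MISSING")
```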

api_utils.py

Lines changed: 50 additions & 0 deletions

```python
'''
Reference client setup (the clients are created in app.py and passed into
the helpers below):

from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
from groq import Groq
import os

load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)
client_groq = Groq(api_key=os.environ["GROQ_API_KEY"])
'''

def openai_embed(client, text):
    """Return the embedding vector for `text` from OpenAI's embeddings API."""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"  # or "text-embedding-3-small"
    )
    return response.data[0].embedding

def audio_to_text(audio_file_path, client):
    """Transcribe an audio file to text with OpenAI Whisper."""
    with open(audio_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcript.text

def text_to_audio(text, client, output_path):
    """Synthesize `text` to speech with OpenAI TTS and write it to `output_path`."""
    try:
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text
        )
        response.stream_to_file(output_path)
        return output_path
    except Exception as e:
        print(f"Error generating audio: {e}")
        return None

async def chat_completion(conversation_history, client_groq):
    """Get a chat completion from Llama3-8B via the async Groq client."""
    response = await client_groq.chat.completions.create(
        messages=conversation_history,
        model="llama3-8b-8192",
        max_tokens=200
    )
    return response.choices[0].message.content
```
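
A minimal sketch of how these helpers compose into one conversational turn, assuming a valid `.env` and an existing `input.wav` recording (the file names here are illustrative):

```python
import asyncio
import os
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from groq import AsyncGroq
from api_utils import audio_to_text, text_to_audio, chat_completion

load_dotenv(find_dotenv())
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
client_groq = AsyncGroq(api_key=os.environ["GROQ_API_KEY"])

question = audio_to_text("input.wav", client)                # Whisper STT
history = [{"role": "user", "content": question}]
answer = asyncio.run(chat_completion(history, client_groq))  # Llama3 via Groq
print(answer)
text_to_audio(answer, client, "response.mp3")                # OpenAI TTS
```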

app.py

Lines changed: 115 additions & 0 deletions

```python
import os
import asyncio
import pyaudio
import wave
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from groq import AsyncGroq
import webrtcvad
from pydub import AudioSegment
import simpleaudio as sa
import sys
from api_utils import audio_to_text, text_to_audio, chat_completion

load_dotenv(find_dotenv(), override=True)
api_key = os.environ['OPENAI_API_KEY']
groq_api_key = os.environ["GROQ_API_KEY"]

client = OpenAI(api_key=api_key)
client_groq = AsyncGroq(api_key=groq_api_key)

def record_wav(timeout=2, silence_threshold=1):
    """Record microphone audio until `silence_threshold` seconds of silence
    follow speech; return the WAV path, or None if no speech starts within
    `timeout` seconds."""
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # most aggressive speech/non-speech filtering

    form_1 = pyaudio.paInt16
    chans = 1
    samp_rate = 16000
    frame_duration = 10  # frame duration in ms (10 ms for finer control)
    frame_size = int(samp_rate * frame_duration / 1000)  # samples per frame
    chunk = frame_size  # read exactly one 10 ms VAD frame per iteration
    wav_output_filename = 'input.wav'

    audio = pyaudio.PyAudio()
    stream = audio.open(format=form_1, rate=samp_rate, channels=chans, input=True, frames_per_buffer=chunk)

    frames = []
    sys.stdout.write("Listening...")
    sys.stdout.flush()

    silence_duration = 0
    waiting_duration = 0
    is_speaking = False
    while True:
        data = stream.read(chunk)
        if vad.is_speech(data, samp_rate):
            frames.append(data)
            silence_duration = 0
            if not is_speaking:
                sys.stdout.write("\rRecording... ")
                sys.stdout.flush()
                is_speaking = True
        elif is_speaking:
            # Speech already started: stop once the trailing silence is long enough.
            silence_duration += frame_duration / 1000
            if silence_duration > silence_threshold:
                sys.stdout.write("\rFinished recording\n")
                sys.stdout.flush()
                break
        else:
            # No speech yet: give up after `timeout` seconds so the caller gets None.
            waiting_duration += frame_duration / 1000
            if waiting_duration > timeout:
                break

    stream.stop_stream()
    stream.close()
    audio.terminate()

    if len(frames) == 0:
        return None

    wavefile = wave.open(wav_output_filename, 'wb')
    wavefile.setnchannels(chans)
    wavefile.setsampwidth(audio.get_sample_size(form_1))
    wavefile.setframerate(samp_rate)
    wavefile.writeframes(b''.join(frames))
    wavefile.close()

    return wav_output_filename

def play_audio(file_path):
    """Decode an audio file with pydub and play it via simpleaudio."""
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        audio = AudioSegment.from_file(file_path)
        play_obj = sa.play_buffer(audio.raw_data, num_channels=audio.channels, bytes_per_sample=audio.sample_width, sample_rate=audio.frame_rate)
        play_obj.wait_done()
    except Exception as e:
        print(f"Error playing audio file: {e}")

async def main():
    conversation_history = [
        {"role": "system", "content": "You are an audio assistant. Use colloquial language and be concise in your responses. If the user is making casual comments, keep the response under 10 words. If the user is asking for technical, academic, or emotional support, you may respond in 60 words or so."}
    ]

    while True:
        audio_file_path = record_wav()

        if audio_file_path is None:
            print("No speech detected. Skipping processing.")
            await asyncio.sleep(1)
            continue

        # Speech-to-text (Whisper), chat completion (Llama3 via Groq), text-to-speech (OpenAI TTS).
        question = audio_to_text(audio_file_path, client)
        print(f"\033[34mUser:\033[0m {question}")

        conversation_history.append({"role": "user", "content": question})

        response_text = await chat_completion(conversation_history, client_groq)
        print("\033[32mModel:\033[0m", response_text)
        conversation_history.append({"role": "assistant", "content": response_text})

        response_audio_path = text_to_audio(response_text, client, "response.mp3")
        if response_audio_path:
            play_audio(response_audio_path)

        await asyncio.sleep(1)

if __name__ == "__main__":
    asyncio.run(main())
```
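
A note on the VAD framing in `record_wav`: webrtcvad only accepts 16-bit mono PCM frames of 10, 20, or 30 ms at 8, 16, 32, or 48 kHz, which is why the loop reads one 10 ms frame (160 samples at 16 kHz) per iteration. A standalone sketch of the frame format:

```python
import webrtcvad

vad = webrtcvad.Vad(3)               # aggressiveness 0 (permissive) to 3 (strict)
frame = b"\x00\x00" * 160            # one 10 ms frame at 16 kHz: 160 samples, 16-bit mono
print(vad.is_speech(frame, 16000))   # digital silence, expected to print False
```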

demo.png

32.2 KB

requirements.txt

Lines changed: 8 additions & 0 deletions

```
pyaudio
python-dotenv
openai
groq
webrtcvad
pydub
simpleaudio
# wave is part of the Python standard library; no pip install needed
```
