
Commit aa97bc2 (0 parents)

first_github_upload

File tree

6 files changed: +194 additions, -0 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

```gitignore
*.mp3
*.wav
.env
__pycache__/
```

README.md

Lines changed: 17 additions & 0 deletions

## Voice Chat with LLM

This code starts a voice chat with an LLM. Voice activity detection (VAD) is used to detect the start and end of speech. The language and voice models used are:

1. OpenAI Whisper for audio-to-text transcription;
2. Llama3-8B served through the Groq API for fast responses;
3. OpenAI TTS for text-to-speech.

![demo](./demo.png)

## Deployment

1. Create a `.env` file:

```.env
OPENAI_API_KEY=
GROQ_API_KEY=
```

2. Run the following commands:

```sh
pip install -r requirements.txt
python app.py
```
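
Before running `app.py`, it can help to verify that step 1 took effect. A minimal sanity check, illustrative only and not part of the repo:

```python
import os
from dotenv import load_dotenv, find_dotenv

# Load the .env file created in step 1 and report which keys are present.
load_dotenv(find_dotenv())
for key in ("OPENAI_API_KEY", "GROQ_API_KEY"):
    print(key, "is set" if os.environ.get(key) else "is MISSING")
```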

api_utils.py

Lines changed: 50 additions & 0 deletions

```python
'''
Reference client setup (the clients are created in app.py and passed into
the helpers below):

from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
from groq import Groq
import os

load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)
client_groq = Groq(api_key=os.environ["GROQ_API_KEY"])
'''

def openai_embed(client, text):
    """Return the embedding vector for `text` from OpenAI's embeddings API."""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"  # or "text-embedding-3-small"
    )
    return response.data[0].embedding

def audio_to_text(audio_file_path, client):
    """Transcribe an audio file to text with OpenAI Whisper."""
    with open(audio_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcript.text

def text_to_audio(text, client, output_path):
    """Synthesize `text` to speech with OpenAI TTS and write it to `output_path`."""
    try:
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text
        )
        response.stream_to_file(output_path)
        return output_path
    except Exception as e:
        print(f"Error generating audio: {e}")
        return None

async def chat_completion(conversation_history, client_groq):
    """Get a chat completion from Llama3-8B via the async Groq client."""
    response = await client_groq.chat.completions.create(
        messages=conversation_history,
        model="llama3-8b-8192",
        max_tokens=200
    )
    return response.choices[0].message.content
```
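
A minimal sketch of how these helpers compose into one conversational turn, assuming a valid `.env` and an existing `input.wav` recording (the file names here are illustrative):

```python
import asyncio
import os
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from groq import AsyncGroq
from api_utils import audio_to_text, text_to_audio, chat_completion

load_dotenv(find_dotenv())
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
client_groq = AsyncGroq(api_key=os.environ["GROQ_API_KEY"])

question = audio_to_text("input.wav", client)                # Whisper STT
history = [{"role": "user", "content": question}]
answer = asyncio.run(chat_completion(history, client_groq))  # Llama3 via Groq
print(answer)
text_to_audio(answer, client, "response.mp3")                # OpenAI TTS
```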

app.py

Lines changed: 115 additions & 0 deletions

```python
import os
import asyncio
import pyaudio
import wave
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from groq import AsyncGroq
import webrtcvad
from pydub import AudioSegment
import simpleaudio as sa
import sys
from api_utils import audio_to_text, text_to_audio, chat_completion

load_dotenv(find_dotenv(), override=True)
api_key = os.environ['OPENAI_API_KEY']
groq_api_key = os.environ["GROQ_API_KEY"]

client = OpenAI(api_key=api_key)
client_groq = AsyncGroq(api_key=groq_api_key)

def record_wav(timeout=2, silence_threshold=1):
    """Record microphone audio until `silence_threshold` seconds of silence
    follow speech; return the WAV path, or None if no speech starts within
    `timeout` seconds."""
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # most aggressive speech/non-speech filtering

    form_1 = pyaudio.paInt16
    chans = 1
    samp_rate = 16000
    frame_duration = 10  # frame duration in ms (10 ms for finer control)
    frame_size = int(samp_rate * frame_duration / 1000)  # samples per frame
    chunk = frame_size  # read exactly one 10 ms VAD frame per iteration
    wav_output_filename = 'input.wav'

    audio = pyaudio.PyAudio()
    stream = audio.open(format=form_1, rate=samp_rate, channels=chans, input=True, frames_per_buffer=chunk)

    frames = []
    sys.stdout.write("Listening...")
    sys.stdout.flush()

    silence_duration = 0
    waiting_duration = 0
    is_speaking = False
    while True:
        data = stream.read(chunk)
        if vad.is_speech(data, samp_rate):
            frames.append(data)
            silence_duration = 0
            if not is_speaking:
                sys.stdout.write("\rRecording... ")
                sys.stdout.flush()
                is_speaking = True
        elif is_speaking:
            # Speech already started: stop once the trailing silence is long enough.
            silence_duration += frame_duration / 1000
            if silence_duration > silence_threshold:
                sys.stdout.write("\rFinished recording\n")
                sys.stdout.flush()
                break
        else:
            # No speech yet: give up after `timeout` seconds so the caller gets None.
            waiting_duration += frame_duration / 1000
            if waiting_duration > timeout:
                break

    stream.stop_stream()
    stream.close()
    audio.terminate()

    if len(frames) == 0:
        return None

    wavefile = wave.open(wav_output_filename, 'wb')
    wavefile.setnchannels(chans)
    wavefile.setsampwidth(audio.get_sample_size(form_1))
    wavefile.setframerate(samp_rate)
    wavefile.writeframes(b''.join(frames))
    wavefile.close()

    return wav_output_filename

def play_audio(file_path):
    """Decode an audio file with pydub and play it via simpleaudio."""
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        audio = AudioSegment.from_file(file_path)
        play_obj = sa.play_buffer(audio.raw_data, num_channels=audio.channels, bytes_per_sample=audio.sample_width, sample_rate=audio.frame_rate)
        play_obj.wait_done()
    except Exception as e:
        print(f"Error playing audio file: {e}")

async def main():
    conversation_history = [
        {"role": "system", "content": "You are an audio assistant. Use colloquial language and be concise in your responses. If the user is making casual comments, keep the response under 10 words. If the user is asking for technical, academic, or emotional support, you may respond in 60 words or so."}
    ]

    while True:
        audio_file_path = record_wav()

        if audio_file_path is None:
            print("No speech detected. Skipping processing.")
            await asyncio.sleep(1)
            continue

        # Speech-to-text (Whisper), chat completion (Llama3 via Groq), text-to-speech (OpenAI TTS).
        question = audio_to_text(audio_file_path, client)
        print(f"\033[34mUser:\033[0m {question}")

        conversation_history.append({"role": "user", "content": question})

        response_text = await chat_completion(conversation_history, client_groq)
        print("\033[32mModel:\033[0m", response_text)
        conversation_history.append({"role": "assistant", "content": response_text})

        response_audio_path = text_to_audio(response_text, client, "response.mp3")
        if response_audio_path:
            play_audio(response_audio_path)

        await asyncio.sleep(1)

if __name__ == "__main__":
    asyncio.run(main())
```
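
A note on the VAD framing in `record_wav`: webrtcvad only accepts 16-bit mono PCM frames of 10, 20, or 30 ms at 8, 16, 32, or 48 kHz, which is why the loop reads one 10 ms frame (160 samples at 16 kHz) per iteration. A standalone sketch of the frame format:

```python
import webrtcvad

vad = webrtcvad.Vad(3)               # aggressiveness 0 (permissive) to 3 (strict)
frame = b"\x00\x00" * 160            # one 10 ms frame at 16 kHz: 160 samples, 16-bit mono
print(vad.is_speech(frame, 16000))   # digital silence, expected to print False
```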

demo.png

32.2 KB

requirements.txt

Lines changed: 8 additions & 0 deletions

```
pyaudio
python-dotenv
openai
groq
webrtcvad
pydub
simpleaudio
# wave is part of the Python standard library; no pip install needed
```
