Compare commits

...

11 Commits

Author SHA1 Message Date
8aa7bd2e99 cleanup 2024-02-28 12:10:49 -05:00
3c0a9b150b audio playback for both user and assistant 2024-02-28 11:55:34 -05:00
5cc002a110 assistant response is replayable 2024-02-28 11:33:08 -05:00
7562778f18 merged gitignores 2024-02-26 20:21:17 -05:00
4fd825e1ae fixing urls 2024-02-26 20:03:58 -05:00
7acdbb3136 cleaning up readmes 2024-02-26 19:58:28 -05:00
f1c2108bc7 updating readmes 2024-02-26 19:43:10 -05:00
056e1067f4 building frontend and serving with fastapi 2024-02-26 13:17:26 -05:00
1916185f19 cleaning up backend sins now 2024-02-26 11:54:00 -05:00
64bb9f9db3 cleaning up my sins 2024-02-26 11:31:01 -05:00
42c605d992 cleanup 2024-02-25 17:47:25 -05:00
9 changed files with 236 additions and 157 deletions

25
.gitignore vendored
View File

@@ -3,3 +3,28 @@
**/audio **/audio
*.mp3 *.mp3
*.webm *.webm
.env
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

View File

@@ -1,2 +1,14 @@
# ai_sandbox # ai_sandbox
A sandbox for learning about the current AI tool landscape
## Subprojects
### [Speech to Speech AI Assistant](./speech-speech/)
AI assistant chat with speech recognition and tts responses
Fullstack
- Vite, TS, React frontend
- fastapi backend
- OpenAI for LLM services

View File

@@ -1,24 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

35
speech-speech/README.md Normal file
View File

@@ -0,0 +1,35 @@
# Speech to Speech AI Assistant
AI assistant chat with speech recognition and tts responses
Fullstack
- Vite, TS, React frontend
- fastapi backend
- OpenAI for LLM services
## Requirements
- python3
- npm
- OpenAI API token
## Setup
```
cd frontend
npm install
npm run build
cd ../backend
# optionally setup virtual environment of your choice
python3 -m pip install -r requirements.txt
```
## Running
example `backend/.env`
```
OPENAI_API_KEY=<apikey>
```
```
cd backend
source .env
uvicorn --port 8080 api:app
```

View File

@@ -1 +0,0 @@
OPENAI_API_KEY=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

View File

@@ -1,8 +1,9 @@
from openai import OpenAI from openai import OpenAI
from fastapi import FastAPI, File, Response, Request from fastapi import FastAPI, File, Response, Request
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from pydantic import BaseModel from pydantic import BaseModel
from io import BytesIO
app = FastAPI() app = FastAPI()
@@ -26,22 +27,20 @@ class Conversation(BaseModel):
@app.post("/get-text") @app.post("/get-text")
def stt(audio: bytes = File()): async def stt(audio: bytes = File()):
with open("audio.webm", "wb+") as f: with BytesIO(audio) as f:
f.write(audio) f.name = "audio.mp3"
transcript = openAI_clinet.audio.transcriptions.create( transcript = openAI_clinet.audio.transcriptions.create(
model="whisper-1", model="whisper-1",
file=f, file=f,
response_format="text", response_format="text",
) )
data = {"len": len(audio), "user-transcript": transcript} data = {"len": len(audio), "user-transcript": transcript}
return data return data
@app.post("/conversation") @app.post("/conversation")
async def get_next_response(request: Request): async def get_next_response(request: Request):
# role = "test"
# res_msg = "temp test response"
messages = await request.json() messages = await request.json()
res = openAI_clinet.chat.completions.create( res = openAI_clinet.chat.completions.create(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
@@ -57,10 +56,11 @@ async def get_next_response(request: Request):
@app.get("/speak") @app.get("/speak")
def tts(text: str): def tts(text: str):
res = openAI_clinet.audio.speech.create( res = openAI_clinet.audio.speech.create(
model="tts-1", model="tts-1", voice="nova", input=text, response_format="mp3"
voice="nova",
input=text,
response_format='mp3'
) )
# this works for now but I need to find a way to stream this to response
return Response(content=res.content, media_type="audio/mp3") return Response(content=res.content, media_type="audio/mp3")
# Mount the static frontend last: a "/" mount registered above the API routes
# would try to serve files instead of matching the intended route.
app.mount("/", StaticFiles(directory="dist", html=True), name="static")

View File

@@ -1,34 +1,24 @@
import { useEffect, useRef, useState } from "react"; import { useState } from "react";
import { import { ChatMsg, Controls, Feed, Header } from "./components.tsx";
TbBrandOpenai,
TbMicrophone2,
TbPlayerPlay,
TbPlayerStop,
} from "react-icons/tb";
import "./App.css"; import "./App.css";
type ChatMsg = { let userAudio: Array<Blob> = [];
role: string; let audioBlobs: Array<Blob> = [];
content: string;
};
let audioBlobs = [];
let streamBeingCaptured: MediaStream | null = null; let streamBeingCaptured: MediaStream | null = null;
let mediaRecorder: MediaRecorder | null = null; let mediaRecorder: MediaRecorder | null = null;
function get_mic() { function get_mic() {
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
console.log("getUserMedia supported."); console.log("getUserMedia supported.");
return navigator.mediaDevices.getUserMedia({ audio: true }); return navigator.mediaDevices.getUserMedia({ audio: true });
} else {
console.log("getUserMedia not supported on your browser!");
} }
throw "getUserMedia not supported on your browser!";
} }
function startRecord() { function startRecord() {
audioBlobs = []; audioBlobs = [];
get_mic().then((stream) => { get_mic().then((stream) => {
console.log("got mic");
streamBeingCaptured = stream; streamBeingCaptured = stream;
mediaRecorder = new MediaRecorder(stream); mediaRecorder = new MediaRecorder(stream);
console.log("Starting Recording"); console.log("Starting Recording");
@@ -40,6 +30,12 @@ function startRecord() {
} }
function stopRecord() { function stopRecord() {
if (!mediaRecorder) {
throw "MediaRecorder not set";
}
if (!streamBeingCaptured) {
throw "Stream not set";
}
mediaRecorder.stop(); mediaRecorder.stop();
streamBeingCaptured.getTracks() streamBeingCaptured.getTracks()
.forEach((track) => track.stop()); .forEach((track) => track.stop());
@@ -52,67 +48,16 @@ function stopRecord() {
function playRecord() { function playRecord() {
const audioBlob = new Blob(audioBlobs, { type: "audio/webm" }); const audioBlob = new Blob(audioBlobs, { type: "audio/webm" });
const audioUrl = URL.createObjectURL(audioBlob); const audioUrl = URL.createObjectURL(audioBlob);
const audio = new Audio(audioUrl); const audio = new Audio(audioUrl);
audio.play(); audio.play();
} }
function playMsg(msg: ChatMsg) { function App() {
const audio = new Audio("http://100.82.51.22:8001/speak?" + new URLSearchParams({text: msg.content}));
console.log("loading audio and playing?")
audio.play();
}
function Header() {
return (
<header className="header p-3">
<div className="title text-5xl font-extrabold">
Speach to Speech AI example
</div>
</header>
);
}
function Feed(props: { chat: Array[ChatMsg]; setChatStateFn: any }) {
const bottomRef = useRef(null);
const scrollToBottom = () => {
bottomRef.current?.scrollIntoView({ behavior: "smooth" });
};
useEffect(() => {
scrollToBottom();
console.log("scroll?");
});
return (
<div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
<div className="content-center space-y-2 divide-y-4">
{props.chat.filter((m: ChatMsg) => m.role != "system").map((
m: ChatMsg,
i: number,
) => <Msg key={i} msg={m} />)}
</div>
<div ref={bottomRef} />
</div>
);
}
function Msg(props: { msg: ChatMsg }) {
return (
<div className="Messege text-lg">
<span className="font-bold">
{props.msg.role.toUpperCase()}:
</span>
<br />
<span className="ml-8">
{props.msg.content}
</span>
</div>
);
}
function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
const [recordState, setRecordState] = useState(false); const [recordState, setRecordState] = useState(false);
const [chatState, setChatState] = useState([{
role: "system",
content: "You are a helpful assistant.",
}]);
function toggleRecord() { function toggleRecord() {
if (recordState == false) { if (recordState == false) {
@@ -125,68 +70,41 @@ function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
} }
function sendAudio() { function sendAudio() {
var formData = new FormData(); let formData = new FormData();
formData.append("audio", new Blob(audioBlobs, { type: "audio/webm" })); let audio = new Blob(audioBlobs, { type: "audio/webm" });
fetch("http://100.82.51.22:8001/get-text", { userAudio.push(audio);
formData.append("audio", audio);
fetch("/get-text", {
"method": "POST", "method": "POST",
"body": formData, "body": formData,
}).then((res) => res.json()) }).then((res) => res.json())
.then((res) => { .then((res) => {
console.log(res); setChatState((curState: Array<ChatMsg>) => [
props.setChatStateFn((curState) => [
...curState, ...curState,
{ "role": "user", "content": res["user-transcript"] }, {
"role": "user",
"content": res["user-transcript"],
"audio": URL.createObjectURL(userAudio[userAudio.length - 1]),
},
]); ]);
fetch("http://100.82.51.22:8001/conversation", { fetch("/conversation", {
"method": "POST", "method": "POST",
"body": JSON.stringify([...props.chat, { "body": JSON.stringify([...chatState, {
"role": "user", "role": "user",
"content": res["user-transcript"], "content": res["user-transcript"],
}]), }]),
}).then((res) => res.json()) }).then((res) => res.json())
.then((res) => { .then((res) => {
props.setChatStateFn((curState) => [...curState, res]); setChatState((
console.log("attempting to play result") curState: Array<ChatMsg>,
playMsg(res) ) => [...curState, {
...res,
"audio": "/speak?" + new URLSearchParams({ text: res.content }),
}]);
}); });
}); });
} }
return (
<div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
<button
onClick={() => toggleRecord()}
className={"inline-flex " + (recordState ? "text-red-500" : "")}
>
{recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
{recordState ? "STOP" : "REC"}
</button>
<button
onClick={() => playRecord()}
className="inline-flex text-green-500"
>
<TbPlayerPlay /> PLAY
</button>
<button
onClick={() => {
sendAudio();
}}
className="inline-flex"
>
<TbBrandOpenai /> SEND
</button>
</div>
);
}
function App() {
const [chatState, setChatState] = useState([{
role: "system",
content: "You are a helpful assistant.",
}]);
return ( return (
<> <>
<div className="h-screen center flex flex-col"> <div className="h-screen center flex flex-col">
@@ -195,7 +113,12 @@ function App() {
<hr className="mx-3 border-t-4" /> <hr className="mx-3 border-t-4" />
</div> </div>
<Feed chat={chatState} setChatStateFn={setChatState} /> <Feed chat={chatState} setChatStateFn={setChatState} />
<Controls setChatStateFn={setChatState} chat={chatState} /> <Controls
recButtonOnClick={toggleRecord}
recordState={recordState}
playButtonOnClick={playRecord}
sendButtonOnClick={sendAudio}
/>
</div> </div>
</> </>
); );

View File

@@ -0,0 +1,106 @@
import { useEffect, useRef } from "react";
import {
TbBrandOpenai,
TbMicrophone2,
TbPlayerPlay,
TbPlayerStop,
} from "react-icons/tb";
/**
 * One entry in the conversation rendered by the feed.
 */
export type ChatMsg = {
// Speaker of the message. Feed hides entries whose role is "system", and Msg
// auto-plays audio for entries whose role is "assistant".
role: string;
// Text of the message, displayed as-is.
content: string;
// Optional URL used as the `src` of the message's <audio> element.
audio?: string;
};
/**
 * Page banner that displays the application title.
 *
 * Takes no props and keeps no state.
 */
export function Header() {
  return (
    <header className="header p-3">
      <div className="title text-5xl font-extrabold">
        Speech to Speech AI example
      </div>
    </header>
  );
}
/**
 * Scrollable list of chat messages.
 *
 * Messages whose role is "system" are not rendered. Whenever the `chat` array
 * changes, the view smoothly scrolls to the bottom so the newest message is
 * visible.
 *
 * @param props.chat - Full conversation, including any system prompt.
 * @param props.setChatStateFn - Accepted for caller compatibility; not used
 *   by this component.
 */
export function Feed(props: { chat: Array<ChatMsg>; setChatStateFn: any }) {
  // Sentinel element placed after the last message; scrolling it into view
  // brings the bottom of the feed on screen.
  const bottomRef = useRef<HTMLDivElement>(null);

  // Only scroll when the conversation itself changes, not on every re-render
  // of the parent (e.g. when the record button toggles).
  useEffect(() => {
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [props.chat]);

  const visibleMessages = props.chat.filter(
    (m: ChatMsg) => m.role !== "system",
  );

  return (
    <div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
      <div className="content-center space-y-2 divide-y-4">
        {visibleMessages.map((m: ChatMsg, i: number) => (
          <Msg key={i} msg={m} />
        ))}
      </div>
      <div ref={bottomRef} />
    </div>
  );
}
/**
 * Renders a single chat message: the upper-cased role, the message text and,
 * when the message carries an audio URL, a player for it.
 *
 * Assistant messages start playing automatically; other roles only play when
 * the user presses play.
 *
 * @param props.msg - The message to display.
 */
export function Msg(props: { msg: ChatMsg }) {
  const { role, content, audio } = props.msg;
  return (
    <div className="Messege text-lg">
      <span className="font-bold">
        {role.toUpperCase()}:
      </span>
      <br />
      <span className="ml-8">
        {content}
      </span>
      {/* `audio` is optional on ChatMsg; skip the player rather than render one with no source. */}
      {audio
        ? (
          <audio
            controls
            autoPlay={role === "assistant"}
            src={audio}
          />
        )
        : null}
    </div>
  );
}
/**
 * Button bar with the three recording actions: record/stop, play back the
 * recording, and send it.
 *
 * The component is purely presentational; all behavior is supplied by the
 * caller through the click handlers.
 *
 * @param props.recButtonOnClick - Called when the REC/STOP button is clicked.
 * @param props.recordState - When true the first button is shown in red with
 *   a stop icon and "STOP" label; otherwise a microphone icon and "REC".
 * @param props.playButtonOnClick - Called when the PLAY button is clicked.
 * @param props.sendButtonOnClick - Called when the SEND button is clicked.
 */
export function Controls(
  props: {
    // Primitive `boolean` and explicit call signatures instead of the boxed
    // `Boolean` and untyped `Function`: a Boolean object is always truthy and
    // `Function` gives no checking of how the handler is called.
    recButtonOnClick: () => void;
    recordState: boolean;
    playButtonOnClick: () => void;
    sendButtonOnClick: () => void;
  },
) {
  const { recordState } = props;
  return (
    <div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
      <button
        onClick={() => props.recButtonOnClick()}
        className={"inline-flex " + (recordState ? "text-red-500" : "")}
      >
        {recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
        {recordState ? "STOP" : "REC"}
      </button>
      <button
        onClick={() => props.playButtonOnClick()}
        className="inline-flex text-green-500"
      >
        <TbPlayerPlay /> PLAY
      </button>
      <button
        onClick={() => props.sendButtonOnClick()}
        className="inline-flex"
      >
        <TbBrandOpenai /> SEND
      </button>
    </div>
  );
}

View File

@@ -9,4 +9,7 @@ export default defineConfig({
"Access-Control-Allow-Origin": '*', "Access-Control-Allow-Origin": '*',
}, },
}, },
build: {
outDir: '../backend/dist/',
},
}); });