Compare commits
15 Commits
c8fa61e0c3
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 8aa7bd2e99 | |||
| 3c0a9b150b | |||
| 5cc002a110 | |||
| 7562778f18 | |||
| 4fd825e1ae | |||
| 7acdbb3136 | |||
| f1c2108bc7 | |||
| 056e1067f4 | |||
| 1916185f19 | |||
| 64bb9f9db3 | |||
| 42c605d992 | |||
| b7787be635 | |||
| ebcfa7e19e | |||
| baab95660b | |||
| 8af852d82c |
28
.gitignore
vendored
28
.gitignore
vendored
@@ -1,2 +1,30 @@
|
||||
**/.venv/
|
||||
**/__pycache__/
|
||||
**/audio
|
||||
*.mp3
|
||||
*.webm
|
||||
.env
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
# Editor directories and files
|
||||
.vscode/*
|
||||
!.vscode/extensions.json
|
||||
.idea
|
||||
.DS_Store
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
|
||||
12
README.md
12
README.md
@@ -1,2 +1,14 @@
|
||||
# ai_sandbox
|
||||
|
||||
A sandbox for learning about the current AI tool landscape
|
||||
|
||||
## Subprojects
|
||||
|
||||
### [Speech to Speech AI Assistant](./speech-speech/)
|
||||
AI assistant chat with speech recognition and text-to-speech (TTS) responses
|
||||
|
||||
Fullstack
|
||||
- Vite, TS, React frontend
|
||||
- FastAPI backend
|
||||
- OpenAI for LLM services
|
||||
|
||||
|
||||
24
speech-speech/.gitignore
vendored
24
speech-speech/.gitignore
vendored
@@ -1,24 +0,0 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
# Editor directories and files
|
||||
.vscode/*
|
||||
!.vscode/extensions.json
|
||||
.idea
|
||||
.DS_Store
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
35
speech-speech/README.md
Normal file
35
speech-speech/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Speech to Speech AI Assistant
|
||||
AI assistant chat with speech recognition and text-to-speech (TTS) responses
|
||||
|
||||
Fullstack
|
||||
- Vite, TS, React frontend
|
||||
- FastAPI backend
|
||||
- OpenAI for LLM services
|
||||
|
||||
## Requirements
|
||||
- python3
|
||||
- npm
|
||||
- OpenAI API token
|
||||
|
||||
## Setup
|
||||
```
|
||||
cd frontend
|
||||
npm install
|
||||
npm run build
|
||||
|
||||
cd ../backend
|
||||
# optionally set up a virtual environment of your choice
|
||||
python3 -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Running
|
||||
example `backend/.env`
|
||||
```
|
||||
OPENAI_API_KEY=<apikey>
|
||||
```
|
||||
|
||||
```
|
||||
cd backend
|
||||
set -a; source .env; set +a  # export the variables so uvicorn can read them
|
||||
uvicorn --port 8080 api:app
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
OPENAI_API_KEY=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
|
||||
@@ -1,12 +1,20 @@
|
||||
from openai import OpenAI
|
||||
from fastapi import FastAPI, File, Response, Request
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
import whisper
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
openAI_clinet = OpenAI()
|
||||
model = whisper.load_model("base")
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
class ConversationMessege(BaseModel):
|
||||
@@ -19,26 +27,20 @@ class Conversation(BaseModel):
|
||||
|
||||
|
||||
@app.post("/get-text")
|
||||
def get_text(response: Response, audio: bytes = File()):
|
||||
response.headers["Access-Control-Allow-Origin"] = "*"
|
||||
with open("audio", "wb") as f:
|
||||
f.write(audio)
|
||||
# transcript = openAI_clinet.audio.transcriptions.create(
|
||||
# model="whisper-1",
|
||||
# file=audio,
|
||||
# response_format="text",
|
||||
# RequestBody
|
||||
# )
|
||||
result = model.transcribe("audio")
|
||||
data = {"len": len(audio), "user-transcript": result["text"]}
|
||||
async def stt(audio: bytes = File()):
|
||||
with BytesIO(audio) as f:
|
||||
f.name = "audio.mp3"
|
||||
transcript = openAI_clinet.audio.transcriptions.create(
|
||||
model="whisper-1",
|
||||
file=f,
|
||||
response_format="text",
|
||||
)
|
||||
data = {"len": len(audio), "user-transcript": transcript}
|
||||
return data
|
||||
|
||||
|
||||
@app.post("/conversation")
|
||||
async def get_next_response(request: Request, response: Response):
|
||||
response.headers["Access-Control-Allow-Origin"] = "*"
|
||||
#role = "test"
|
||||
#res_msg = "temp test response"
|
||||
async def get_next_response(request: Request):
|
||||
messages = await request.json()
|
||||
res = openAI_clinet.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
@@ -49,3 +51,16 @@ async def get_next_response(request: Request, response: Response):
|
||||
print(messages)
|
||||
print(res_msg)
|
||||
return {"role": role, "content": res_msg}
|
||||
|
||||
|
||||
@app.get("/speak")
|
||||
def tts(text: str):
|
||||
res = openAI_clinet.audio.speech.create(
|
||||
model="tts-1", voice="nova", input=text, response_format="mp3"
|
||||
)
|
||||
return Response(content=res.content, media_type="audio/mp3")
|
||||
|
||||
|
||||
# if this is above other routes it will try and serve files instead of matching
|
||||
# the intended route
|
||||
app.mount("/", StaticFiles(directory="dist", html=True), name="static")
|
||||
|
||||
Binary file not shown.
@@ -2,9 +2,8 @@
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Vite + React + TS</title>
|
||||
<title>Speach to Speech AI example</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
|
||||
@@ -1,47 +1,24 @@
|
||||
import { useEffect, useRef, useState } from "react";
|
||||
import {
|
||||
TbBrandOpenai,
|
||||
TbMicrophone2,
|
||||
TbPlayerPlay,
|
||||
TbPlayerStop,
|
||||
} from "react-icons/tb";
|
||||
import { useState } from "react";
|
||||
import { ChatMsg, Controls, Feed, Header } from "./components.tsx";
|
||||
import "./App.css";
|
||||
|
||||
type ChatMsg = {
|
||||
role: string;
|
||||
content: string;
|
||||
};
|
||||
|
||||
function Header() {
|
||||
return (
|
||||
<header className="header p-3">
|
||||
<div className="title text-5xl font-extrabold">
|
||||
Speach to Speech AI example
|
||||
</div>
|
||||
</header>
|
||||
);
|
||||
}
|
||||
|
||||
let audioBlobs = [];
|
||||
let userAudio: Array<Blob> = [];
|
||||
let audioBlobs: Array<Blob> = [];
|
||||
let streamBeingCaptured: MediaStream | null = null;
|
||||
let mediaRecorder: MediaRecorder | null = null;
|
||||
let chat: Array<ChatMsg> = [{
|
||||
role: "system",
|
||||
content: "You are a helpful assistant.",
|
||||
}];
|
||||
|
||||
function get_mic() {
|
||||
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
|
||||
console.log("getUserMedia supported.");
|
||||
return navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
} else {
|
||||
console.log("getUserMedia not supported on your browser!");
|
||||
}
|
||||
throw "getUserMedia not supported on your browser!";
|
||||
}
|
||||
|
||||
function startRecord() {
|
||||
audioBlobs = [];
|
||||
get_mic().then((stream) => {
|
||||
console.log("got mic");
|
||||
streamBeingCaptured = stream;
|
||||
mediaRecorder = new MediaRecorder(stream);
|
||||
console.log("Starting Recording");
|
||||
@@ -53,6 +30,12 @@ function startRecord() {
|
||||
}
|
||||
|
||||
function stopRecord() {
|
||||
if (!mediaRecorder) {
|
||||
throw "MediaRecorder not set";
|
||||
}
|
||||
if (!streamBeingCaptured) {
|
||||
throw "Stream not set";
|
||||
}
|
||||
mediaRecorder.stop();
|
||||
streamBeingCaptured.getTracks()
|
||||
.forEach((track) => track.stop());
|
||||
@@ -69,46 +52,12 @@ function playRecord() {
|
||||
audio.play();
|
||||
}
|
||||
|
||||
function Feed(props: { chat: Array[ChatMsg]; setChatStateFn: any }) {
|
||||
const bottomRef = useRef(null);
|
||||
|
||||
const scrollToBottom = () => {
|
||||
bottomRef.current?.scrollIntoView({ behavior: "smooth" });
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
scrollToBottom();
|
||||
console.log("scroll?");
|
||||
});
|
||||
|
||||
return (
|
||||
<div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
|
||||
<div className="content-center space-y-2 divide-y-4">
|
||||
{props.chat.filter((m: ChatMsg) => m.role != "system").map((
|
||||
m: ChatMsg,
|
||||
) => <Msg msg={m} />)}
|
||||
</div>
|
||||
<div ref={bottomRef} />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function Msg(props: { msg: ChatMsg }) {
|
||||
return (
|
||||
<div className="Messege text-lg">
|
||||
<span className="font-bold">
|
||||
{props.msg.role.toUpperCase()}:
|
||||
</span>
|
||||
<br />
|
||||
<span className="ml-8">
|
||||
{props.msg.content}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
|
||||
function App() {
|
||||
const [recordState, setRecordState] = useState(false);
|
||||
const [chatState, setChatState] = useState([{
|
||||
role: "system",
|
||||
content: "You are a helpful assistant.",
|
||||
}]);
|
||||
|
||||
function toggleRecord() {
|
||||
if (recordState == false) {
|
||||
@@ -121,65 +70,40 @@ function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
|
||||
}
|
||||
|
||||
function sendAudio() {
|
||||
var formData = new FormData();
|
||||
formData.append("audio", new Blob(audioBlobs, { type: "audio/webm" }));
|
||||
fetch("http://100.82.51.22:8001/get-text", {
|
||||
let formData = new FormData();
|
||||
let audio = new Blob(audioBlobs, { type: "audio/webm" });
|
||||
userAudio.push(audio);
|
||||
formData.append("audio", audio);
|
||||
fetch("/get-text", {
|
||||
"method": "POST",
|
||||
"body": formData,
|
||||
}).then((res) => res.json())
|
||||
.then((res) => {
|
||||
console.log(res);
|
||||
props.setChatStateFn((curState) => [
|
||||
setChatState((curState: Array<ChatMsg>) => [
|
||||
...curState,
|
||||
{ "role": "user", "content": res["user-transcript"] },
|
||||
{
|
||||
"role": "user",
|
||||
"content": res["user-transcript"],
|
||||
"audio": URL.createObjectURL(userAudio[userAudio.length - 1]),
|
||||
},
|
||||
]);
|
||||
fetch("http://100.82.51.22:8001/conversation", {
|
||||
fetch("/conversation", {
|
||||
"method": "POST",
|
||||
"body": JSON.stringify([...props.chat, {
|
||||
"body": JSON.stringify([...chatState, {
|
||||
"role": "user",
|
||||
"content": res["user-transcript"],
|
||||
}]),
|
||||
}).then((res) => res.json())
|
||||
.then((res) => {
|
||||
props.setChatStateFn((curState) => [...curState, res]);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
|
||||
<button
|
||||
onClick={() => toggleRecord()}
|
||||
className={"inline-flex " + (recordState ? "text-red-500" : "")}
|
||||
>
|
||||
{recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
|
||||
{recordState ? "STOP" : "REC"}
|
||||
</button>
|
||||
|
||||
<button
|
||||
onClick={() => playRecord()}
|
||||
className="inline-flex text-green-500"
|
||||
>
|
||||
<TbPlayerPlay /> PLAY
|
||||
</button>
|
||||
|
||||
<button
|
||||
onClick={() => {
|
||||
sendAudio();
|
||||
}}
|
||||
className="inline-flex"
|
||||
>
|
||||
<TbBrandOpenai /> SEND
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function App() {
|
||||
const [chatState, setChatState] = useState([{
|
||||
role: "system",
|
||||
content: "You are a helpful assistant.",
|
||||
setChatState((
|
||||
curState: Array<ChatMsg>,
|
||||
) => [...curState, {
|
||||
...res,
|
||||
"audio": "/speak?" + new URLSearchParams({ text: res.content }),
|
||||
}]);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
@@ -189,7 +113,12 @@ function App() {
|
||||
<hr className="mx-3 border-t-4" />
|
||||
</div>
|
||||
<Feed chat={chatState} setChatStateFn={setChatState} />
|
||||
<Controls setChatStateFn={setChatState} chat={chatState} />
|
||||
<Controls
|
||||
recButtonOnClick={toggleRecord}
|
||||
recordState={recordState}
|
||||
playButtonOnClick={playRecord}
|
||||
sendButtonOnClick={sendAudio}
|
||||
/>
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
|
||||
106
speech-speech/frontend/src/components.tsx
Normal file
106
speech-speech/frontend/src/components.tsx
Normal file
@@ -0,0 +1,106 @@
|
||||
import { useEffect, useRef } from "react";
|
||||
import {
|
||||
TbBrandOpenai,
|
||||
TbMicrophone2,
|
||||
TbPlayerPlay,
|
||||
TbPlayerStop,
|
||||
} from "react-icons/tb";
|
||||
|
||||
export type ChatMsg = {
|
||||
role: string;
|
||||
content: string;
|
||||
audio?: string;
|
||||
};
|
||||
|
||||
export function Header() {
|
||||
return (
|
||||
<header className="header p-3">
|
||||
<div className="title text-5xl font-extrabold">
|
||||
Speach to Speech AI example
|
||||
</div>
|
||||
</header>
|
||||
);
|
||||
}
|
||||
|
||||
export function Feed(props: { chat: Array<ChatMsg>; setChatStateFn: any }) {
|
||||
const bottomRef = useRef<any>(null);
|
||||
|
||||
const scrollToBottom = () => {
|
||||
if (bottomRef.current) {
|
||||
bottomRef.current.scrollIntoView({ behavior: "smooth" });
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
scrollToBottom();
|
||||
console.log("scroll?");
|
||||
});
|
||||
|
||||
return (
|
||||
<div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
|
||||
<div className="content-center space-y-2 divide-y-4">
|
||||
{props.chat.filter((m: ChatMsg) => m.role != "system").map((
|
||||
m: ChatMsg,
|
||||
i: number,
|
||||
) => <Msg key={i} msg={m} />)}
|
||||
</div>
|
||||
<div ref={bottomRef} />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export function Msg(props: { msg: ChatMsg }) {
|
||||
return (
|
||||
<div className="Messege text-lg">
|
||||
<span className="font-bold">
|
||||
{props.msg.role.toUpperCase()}:
|
||||
</span>
|
||||
<br />
|
||||
<span className="ml-8">
|
||||
{props.msg.content}
|
||||
</span>
|
||||
<audio
|
||||
controls
|
||||
autoPlay={props.msg.role == "assistant"}
|
||||
src={props.msg.audio}
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export function Controls(
|
||||
props: {
|
||||
recButtonOnClick: Function;
|
||||
recordState: Boolean;
|
||||
playButtonOnClick: Function;
|
||||
sendButtonOnClick: Function;
|
||||
},
|
||||
) {
|
||||
return (
|
||||
<div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
|
||||
<button
|
||||
onClick={() => props.recButtonOnClick()}
|
||||
className={"inline-flex " + (props.recordState ? "text-red-500" : "")}
|
||||
>
|
||||
{props.recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
|
||||
{props.recordState ? "STOP" : "REC"}
|
||||
</button>
|
||||
|
||||
<button
|
||||
onClick={() => props.playButtonOnClick()}
|
||||
className="inline-flex text-green-500"
|
||||
>
|
||||
<TbPlayerPlay /> PLAY
|
||||
</button>
|
||||
|
||||
<button
|
||||
onClick={() => {
|
||||
props.sendButtonOnClick();
|
||||
}}
|
||||
className="inline-flex"
|
||||
>
|
||||
<TbBrandOpenai /> SEND
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -9,4 +9,7 @@ export default defineConfig({
|
||||
"Access-Control-Allow-Origin": '*',
|
||||
},
|
||||
},
|
||||
build: {
|
||||
outDir: '../backend/dist/',
|
||||
},
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user