cleanup

audio playback for both user and assistant
assistant response is replayable
2024-02-28 12:10:49 -05:00 · 2024-02-28 11:55:34 -05:00 · 2024-02-28 11:33:08 -05:00 · 2024-02-26 20:21:17 -05:00 · 2024-02-26 20:03:58 -05:00 · 2024-02-26 19:58:28 -05:00
11 changed files with 260 additions and 158 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,30 @@
 **/.venv/
 **/__pycache__/
+**/audio
+*.mp3
+*.webm
+.env
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
--- a/README.md
+++ b/README.md
@@ -1,2 +1,14 @@
 # ai_sandbox

+A sandbox for exploring the current AI tool landscape
+
+## Subprojects
+
+### [Speech to Speech AI Assistant](./speech-speech/)
+AI chat assistant with speech recognition and text-to-speech (TTS) responses
+
+Fullstack  
+- Vite, TS, React frontend
+- FastAPI backend
+- OpenAI for LLM services
+
--- a/speech-speech/.gitignore
+++ b/speech-speech/.gitignore
@@ -1,24 +0,0 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-pnpm-debug.log*
-lerna-debug.log*
-
-node_modules
-dist
-dist-ssr
-*.local
-
-# Editor directories and files
-.vscode/*
-!.vscode/extensions.json
-.idea
-.DS_Store
-*.suo
-*.ntvs*
-*.njsproj
-*.sln
-*.sw?
--- a/speech-speech/README.md
+++ b/speech-speech/README.md
@@ -0,0 +1,35 @@
+# Speech to Speech AI Assistant
+AI chat assistant with speech recognition and text-to-speech (TTS) responses
+
+Fullstack  
+- Vite, TS, React frontend
+- fastapi backend
+- OpenAI for LLM services
+
+## Requirements
+- python3
+- npm
+- OpenAI API token
+
+## Setup
+```
+cd frontend
+npm install
+npm run build
+
+cd ../backend
+# optionally set up a virtual environment of your choice
+python3 -m pip install -r requirements.txt
+```
+
+## Running
+Example `backend/.env`:
+```
+OPENAI_API_KEY=<apikey>
+```
+
+```
+cd backend
+set -a && source .env && set +a  # export the variables so uvicorn can see them
+uvicorn --port 8080 api:app
+```
--- a/speech-speech/backend/.env
+++ b/speech-speech/backend/.env
@@ -1 +0,0 @@
-OPENAI_API_KEY=<redacted>
--- a/speech-speech/backend/api.py
+++ b/speech-speech/backend/api.py
@@ -1,12 +1,20 @@
 from openai import OpenAI
 from fastapi import FastAPI, File, Response, Request
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-import whisper
+from io import BytesIO


 app = FastAPI()
 openAI_clinet = OpenAI()
-model = whisper.load_model("base")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)


 class ConversationMessege(BaseModel):
@@ -19,26 +27,20 @@ class Conversation(BaseModel):


@app.post("/get-text")
-def get_text(response: Response, audio: bytes = File()):
-    response.headers["Access-Control-Allow-Origin"] = "*"
-    with open("audio", "wb") as f:
-        f.write(audio)
-    # transcript = openAI_clinet.audio.transcriptions.create(
-    #    model="whisper-1",
-    #    file=audio,
-    #    response_format="text",
-    #    RequestBody
-    # )
-    result = model.transcribe("audio")
-    data = {"len": len(audio), "user-transcript": result["text"]}
+async def stt(audio: bytes = File()):
+    with BytesIO(audio) as f:
+        f.name = "audio.mp3"
+        transcript = openAI_clinet.audio.transcriptions.create(
+            model="whisper-1",
+            file=f,
+            response_format="text",
+        )
+    data = {"len": len(audio), "user-transcript": transcript}
    return data


@app.post("/conversation")
-async def get_next_response(request: Request, response: Response):
-    response.headers["Access-Control-Allow-Origin"] = "*"
-    #role = "test"
-    #res_msg = "temp test response"
+async def get_next_response(request: Request):
    messages = await request.json()
    res = openAI_clinet.chat.completions.create(
        model="gpt-3.5-turbo",
@@ -49,3 +51,16 @@ async def get_next_response(request: Request, response: Response):
    print(messages)
    print(res_msg)
    return {"role": role, "content": res_msg}
+
+
+@app.get("/speak")
+def tts(text: str):
+    res = openAI_clinet.audio.speech.create(
+        model="tts-1", voice="nova", input=text, response_format="mp3"
+    )
+    return Response(content=res.content, media_type="audio/mp3")
+
+
+# This mount must stay below the other routes: if it is registered first, it
+# will try to serve static files instead of matching the intended route.
+app.mount("/", StaticFiles(directory="dist", html=True), name="static")
--- a/speech-speech/backend/audio
+++ b/speech-speech/backend/audio
--- a/speech-speech/frontend/index.html
+++ b/speech-speech/frontend/index.html
@@ -2,9 +2,8 @@
 <html lang="en">
  <head>
    <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>Vite + React + TS</title>
+    <title>Speech to Speech AI example</title>
  </head>
  <body>
    <div id="root"></div>
--- a/speech-speech/frontend/src/App.tsx
+++ b/speech-speech/frontend/src/App.tsx
@@ -1,47 +1,24 @@
-import { useEffect, useRef, useState } from "react";
-import {
-  TbBrandOpenai,
-  TbMicrophone2,
-  TbPlayerPlay,
-  TbPlayerStop,
-} from "react-icons/tb";
+import { useState } from "react";
+import { ChatMsg, Controls, Feed, Header } from "./components.tsx";
 import "./App.css";

-type ChatMsg = {
-  role: string;
-  content: string;
-};
-
-function Header() {
-  return (
-    <header className="header p-3">
-      <div className="title text-5xl font-extrabold">
-        Speach to Speech AI example
-      </div>
-    </header>
-  );
-}
-
-let audioBlobs = [];
+let userAudio: Array<Blob> = [];
+let audioBlobs: Array<Blob> = [];
 let streamBeingCaptured: MediaStream | null = null;
 let mediaRecorder: MediaRecorder | null = null;
-let chat: Array<ChatMsg> = [{
-  role: "system",
-  content: "You are a helpful assistant.",
-}];

 function get_mic() {
  if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
    console.log("getUserMedia supported.");
    return navigator.mediaDevices.getUserMedia({ audio: true });
-  } else {
-    console.log("getUserMedia not supported on your browser!");
  }
+  throw "getUserMedia not supported on your browser!";
 }

 function startRecord() {
  audioBlobs = [];
  get_mic().then((stream) => {
+    console.log("got mic");
    streamBeingCaptured = stream;
    mediaRecorder = new MediaRecorder(stream);
    console.log("Starting Recording");
@@ -53,6 +30,12 @@ function startRecord() {
 }

 function stopRecord() {
+  if (!mediaRecorder) {
+    throw "MediaRecorder not set";
+  }
+  if (!streamBeingCaptured) {
+    throw "Stream not set";
+  }
  mediaRecorder.stop();
  streamBeingCaptured.getTracks()
    .forEach((track) => track.stop());
@@ -69,46 +52,12 @@ function playRecord() {
  audio.play();
 }

-function Feed(props: { chat: Array[ChatMsg]; setChatStateFn: any }) {
-  const bottomRef = useRef(null);
-
-  const scrollToBottom = () => {
-    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
-  };
-
-  useEffect(() => {
-    scrollToBottom();
-    console.log("scroll?");
-  });
-
-  return (
-    <div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
-      <div className="content-center  space-y-2 divide-y-4">
-        {props.chat.filter((m: ChatMsg) => m.role != "system").map((
-          m: ChatMsg,
-        ) => <Msg msg={m} />)}
-      </div>
-      <div ref={bottomRef} />
-    </div>
-  );
-}
-
-function Msg(props: { msg: ChatMsg }) {
-  return (
-    <div className="Messege text-lg">
-      <span className="font-bold">
-        {props.msg.role.toUpperCase()}:
-      </span>
-      <br />
-      <span className="ml-8">
-        {props.msg.content}
-      </span>
-    </div>
-  );
-}
-
-function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
+function App() {
  const [recordState, setRecordState] = useState(false);
+  const [chatState, setChatState] = useState([{
+    role: "system",
+    content: "You are a helpful assistant.",
+  }]);

  function toggleRecord() {
    if (recordState == false) {
@@ -121,65 +70,40 @@ function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
  }

  function sendAudio() {
-    var formData = new FormData();
-    formData.append("audio", new Blob(audioBlobs, { type: "audio/webm" }));
-    fetch("http://100.82.51.22:8001/get-text", {
+    let formData = new FormData();
+    let audio = new Blob(audioBlobs, { type: "audio/webm" });
+    userAudio.push(audio);
+    formData.append("audio", audio);
+    fetch("/get-text", {
      "method": "POST",
      "body": formData,
    }).then((res) => res.json())
      .then((res) => {
-        console.log(res);
-        props.setChatStateFn((curState) => [
+        setChatState((curState: Array<ChatMsg>) => [
          ...curState,
-          { "role": "user", "content": res["user-transcript"] },
+          {
+            "role": "user",
+            "content": res["user-transcript"],
+            "audio": URL.createObjectURL(userAudio[userAudio.length - 1]),
+          },
        ]);
-        fetch("http://100.82.51.22:8001/conversation", {
+        fetch("/conversation", {
          "method": "POST",
-          "body": JSON.stringify([...props.chat, {
+          "body": JSON.stringify([...chatState, {
            "role": "user",
            "content": res["user-transcript"],
          }]),
        }).then((res) => res.json())
          .then((res) => {
-            props.setChatStateFn((curState) => [...curState, res]);
-          });
-      });
-  }
-
-  return (
-    <div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
-      <button
-        onClick={() => toggleRecord()}
-        className={"inline-flex " + (recordState ? "text-red-500" : "")}
-      >
-        {recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
-        {recordState ? "STOP" : "REC"}
-      </button>
-
-      <button
-        onClick={() => playRecord()}
-        className="inline-flex text-green-500"
-      >
-        <TbPlayerPlay /> PLAY
-      </button>
-
-      <button
-        onClick={() => {
-          sendAudio();
-        }}
-        className="inline-flex"
-      >
-        <TbBrandOpenai /> SEND
-      </button>
-    </div>
-  );
-}
-
-function App() {
-  const [chatState, setChatState] = useState([{
-    role: "system",
-    content: "You are a helpful assistant.",
+            setChatState((
+              curState: Array<ChatMsg>,
+            ) => [...curState, {
+              ...res,
+              "audio": "/speak?" + new URLSearchParams({ text: res.content }),
            }]);
+          });
+      });
+  }

  return (
    <>
@@ -189,7 +113,12 @@ function App() {
          <hr className="mx-3 border-t-4" />
        </div>
        <Feed chat={chatState} setChatStateFn={setChatState} />
-        <Controls setChatStateFn={setChatState} chat={chatState} />
+        <Controls
+          recButtonOnClick={toggleRecord}
+          recordState={recordState}
+          playButtonOnClick={playRecord}
+          sendButtonOnClick={sendAudio}
+        />
      </div>
    </>
  );
--- a/speech-speech/frontend/src/components.tsx
+++ b/speech-speech/frontend/src/components.tsx
@@ -0,0 +1,106 @@
+import { useEffect, useRef } from "react";
+import {
+  TbBrandOpenai,
+  TbMicrophone2,
+  TbPlayerPlay,
+  TbPlayerStop,
+} from "react-icons/tb";
+
+export type ChatMsg = {
+  role: string;
+  content: string;
+  audio?: string;
+};
+
+export function Header() {
+  return (
+    <header className="header p-3">
+      <div className="title text-5xl font-extrabold">
+        Speach to Speech AI example
+      </div>
+    </header>
+  );
+}
+
+export function Feed(props: { chat: Array<ChatMsg>; setChatStateFn: any }) {
+  const bottomRef = useRef<any>(null);
+
+  const scrollToBottom = () => {
+    if (bottomRef.current) {
+      bottomRef.current.scrollIntoView({ behavior: "smooth" });
+    }
+  };
+
+  useEffect(() => {
+    scrollToBottom();
+    console.log("scroll?");
+  });
+
+  return (
+    <div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
+      <div className="content-center  space-y-2 divide-y-4">
+        {props.chat.filter((m: ChatMsg) => m.role != "system").map((
+          m: ChatMsg,
+          i: number,
+        ) => <Msg key={i} msg={m} />)}
+      </div>
+      <div ref={bottomRef} />
+    </div>
+  );
+}
+
+export function Msg(props: { msg: ChatMsg }) {
+  return (
+    <div className="Messege text-lg">
+      <span className="font-bold">
+        {props.msg.role.toUpperCase()}:
+      </span>
+      <br />
+      <span className="ml-8">
+        {props.msg.content}
+      </span>
+      <audio
+        controls
+        autoPlay={props.msg.role == "assistant"}
+        src={props.msg.audio}
+      />
+    </div>
+  );
+}
+
+export function Controls(
+  props: {
+    recButtonOnClick: Function;
+    recordState: Boolean;
+    playButtonOnClick: Function;
+    sendButtonOnClick: Function;
+  },
+) {
+  return (
+    <div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
+      <button
+        onClick={() => props.recButtonOnClick()}
+        className={"inline-flex " + (props.recordState ? "text-red-500" : "")}
+      >
+        {props.recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
+        {props.recordState ? "STOP" : "REC"}
+      </button>
+
+      <button
+        onClick={() => props.playButtonOnClick()}
+        className="inline-flex text-green-500"
+      >
+        <TbPlayerPlay /> PLAY
+      </button>
+
+      <button
+        onClick={() => {
+          props.sendButtonOnClick();
+        }}
+        className="inline-flex"
+      >
+        <TbBrandOpenai /> SEND
+      </button>
+    </div>
+  );
+}
--- a/speech-speech/frontend/vite.config.ts
+++ b/speech-speech/frontend/vite.config.ts
@@ -9,4 +9,7 @@ export default defineConfig({
      "Access-Control-Allow-Origin": '*',
    },
  },
+	build: {
+		outDir: '../backend/dist/',
+	},
 });
Author	SHA1	Message	Date
Andrei Stoica	8aa7bd2e99	cleanup	2024-02-28 12:10:49 -05:00
Andrei Stoica	3c0a9b150b	audio playback for both user and assistant	2024-02-28 11:55:34 -05:00
Andrei Stoica	5cc002a110	assistant response is replayable	2024-02-28 11:33:08 -05:00
Andrei Stoica	7562778f18	merged gitignores	2024-02-26 20:21:17 -05:00
Andrei Stoica	4fd825e1ae	fixing urls	2024-02-26 20:03:58 -05:00
Andrei Stoica	7acdbb3136	cleaning up readmes	2024-02-26 19:58:28 -05:00
Andrei Stoica	f1c2108bc7	updating readmes	2024-02-26 19:43:10 -05:00
Andrei Stoica	056e1067f4	building frontend and serving with fastapi	2024-02-26 13:17:26 -05:00
Andrei Stoica	1916185f19	cleaning up backend sins now	2024-02-26 11:54:00 -05:00
Andrei Stoica	64bb9f9db3	cleaning up my sins	2024-02-26 11:31:01 -05:00
Andrei Stoica	42c605d992	cleanup	2024-02-25 17:47:25 -05:00
Andrei Stoica	b7787be635	file response streamlining	2024-02-25 17:09:35 -05:00
Andrei Stoica	ebcfa7e19e	playing back response	2024-02-25 13:40:08 -05:00
Andrei Stoica	baab95660b	cors	2024-02-24 17:58:26 -05:00
Andrei Stoica	8af852d82c	tts backend	2024-02-20 19:35:36 -05:00
				`@@ -1 +0,0 @@`
				`OPENAI_API_KEY=<redacted>`