cleanup

audio playback for both user and assistant
assistant response is replayable
2024-02-28 12:10:49 -05:00 · 2024-02-28 11:55:34 -05:00 · 2024-02-28 11:33:08 -05:00 · 2024-02-26 20:21:17 -05:00 · 2024-02-26 20:03:58 -05:00 · 2024-02-26 19:58:28 -05:00
9 changed files with 236 additions and 157 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,28 @@
 **/audio
 *.mp3
 *.webm
 .env
 # Logs
 logs
 *.log
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 pnpm-debug.log*
 lerna-debug.log*
 node_modules
 dist
 dist-ssr
 *.local
 # Editor directories and files
 .vscode/*
 !.vscode/extensions.json
 .idea
 .DS_Store
 *.suo
 *.ntvs*
 *.njsproj
 *.sln
 *.sw?
--- a/README.md
+++ b/README.md
@@ -1,2 +1,14 @@
 # ai_sandbox
 A learning arena for exploring the current AI tool landscape
 ## Subprojects
 ### [Speech to Speech AI Assistant](./speech-speech/)
 AI assistant chat with speech recognition and text-to-speech (TTS) responses
 Full stack:  
 - Vite, TypeScript, React frontend
 - FastAPI backend
 - OpenAI for LLM services
--- a/speech-speech/.gitignore
+++ b/speech-speech/.gitignore
@@ -1,24 +0,0 @@
 # Logs
 logs
 *.log
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 pnpm-debug.log*
 lerna-debug.log*
 node_modules
 dist
 dist-ssr
 *.local
 # Editor directories and files
 .vscode/*
 !.vscode/extensions.json
 .idea
 .DS_Store
 *.suo
 *.ntvs*
 *.njsproj
 *.sln
 *.sw?
--- a/speech-speech/README.md
+++ b/speech-speech/README.md
@@ -0,0 +1,35 @@
 # Speech to Speech AI Assistant
 AI assistant chat with speech recognition and text-to-speech (TTS) responses
 Full stack:  
 - Vite, TypeScript, React frontend
 - FastAPI backend
 - OpenAI for LLM services
 ## Requirements
 - python3
 - npm
 - OpenAI API token
 ## Setup
 ```
 cd frontend
 npm install
 npm run build
 cd ../backend
 # optionally setup virtual environment of your choice
 python3 -m pip install -r requirements.txt
 ```
 ## Running
 Example `backend/.env`:
 ```
 OPENAI_API_KEY=<apikey>
 ```
 ```
 cd backend
 source .env
 uvicorn --port 8080 api:app
 ```
--- a/speech-speech/backend/.env
+++ b/speech-speech/backend/.env
@@ -1 +0,0 @@
 OPENAI_API_KEY=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
--- a/speech-speech/backend/api.py
+++ b/speech-speech/backend/api.py
@@ -1,8 +1,9 @@
 from openai import OpenAI
 from fastapi import FastAPI, File, Response, Request
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from io import BytesIO
 app = FastAPI()
@@ -26,22 +27,20 @@ class Conversation(BaseModel):
@app.post("/get-text")
-def stt(audio: bytes = File()):
+async def stt(audio: bytes = File()):
-    with open("audio.webm", "wb+") as f:
+    with BytesIO(audio) as f:
-        f.write(audio)
+        f.name = "audio.mp3"
        transcript = openAI_clinet.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="text",
-         )
+        )
    data = {"len": len(audio), "user-transcript": transcript}
    return data
@app.post("/conversation")
 async def get_next_response(request: Request):
    # role = "test"
    # res_msg = "temp test response"
    messages = await request.json()
    res = openAI_clinet.chat.completions.create(
        model="gpt-3.5-turbo",
@@ -57,10 +56,11 @@ async def get_next_response(request: Request):
@app.get("/speak")
 def tts(text: str):
    res = openAI_clinet.audio.speech.create(
-        model="tts-1",
+        model="tts-1", voice="nova", input=text, response_format="mp3"
        voice="nova",
        input=text,
        response_format='mp3'
    )
    # this works for now but I need to find a way to stream this to response
    return Response(content=res.content, media_type="audio/mp3")
 # if this is above other routes it will try and serve files instead of matching
 # the intended route
 app.mount("/", StaticFiles(directory="dist", html=True), name="static")
--- a/speech-speech/frontend/src/App.tsx
+++ b/speech-speech/frontend/src/App.tsx
@@ -1,34 +1,24 @@
-import { useEffect, useRef, useState } from "react";
+import { useState } from "react";
-import {
+import { ChatMsg, Controls, Feed, Header } from "./components.tsx";
  TbBrandOpenai,
  TbMicrophone2,
  TbPlayerPlay,
  TbPlayerStop,
 } from "react-icons/tb";
 import "./App.css";
-type ChatMsg = {
+let userAudio: Array<Blob> = [];
-  role: string;
+let audioBlobs: Array<Blob> = [];
  content: string;
 };
 let audioBlobs = [];
 let streamBeingCaptured: MediaStream | null = null;
 let mediaRecorder: MediaRecorder | null = null;
 function get_mic() {
  if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
    console.log("getUserMedia supported.");
    return navigator.mediaDevices.getUserMedia({ audio: true });
  } else {
    console.log("getUserMedia not supported on your browser!");
  }
  throw "getUserMedia not supported on your browser!";
 }
 function startRecord() {
  audioBlobs = [];
  get_mic().then((stream) => {
    console.log("got mic");
    streamBeingCaptured = stream;
    mediaRecorder = new MediaRecorder(stream);
    console.log("Starting Recording");
@@ -40,6 +30,12 @@ function startRecord() {
 }
 function stopRecord() {
  if (!mediaRecorder) {
    throw "MediaRecorder not set";
  }
  if (!streamBeingCaptured) {
    throw "Stream not set";
  }
  mediaRecorder.stop();
  streamBeingCaptured.getTracks()
    .forEach((track) => track.stop());
@@ -52,67 +48,16 @@ function stopRecord() {
 function playRecord() {
  const audioBlob = new Blob(audioBlobs, { type: "audio/webm" });
  const audioUrl = URL.createObjectURL(audioBlob);
-  const audio =  new Audio(audioUrl);
+  const audio = new Audio(audioUrl);
  audio.play();
 }
-function playMsg(msg: ChatMsg) {
+function App() {
  const audio = new Audio("http://100.82.51.22:8001/speak?" + new URLSearchParams({text: msg.content}));
  console.log("loading audio and playing?")
  audio.play();
 }
 function Header() {
  return (
    <header className="header p-3">
      <div className="title text-5xl font-extrabold">
        Speach to Speech AI example
      </div>
    </header>
  );
 }
 function Feed(props: { chat: Array[ChatMsg]; setChatStateFn: any }) {
  const bottomRef = useRef(null);
  const scrollToBottom = () => {
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  };
  useEffect(() => {
    scrollToBottom();
    console.log("scroll?");
  });
  return (
    <div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
      <div className="content-center  space-y-2 divide-y-4">
        {props.chat.filter((m: ChatMsg) => m.role != "system").map((
          m: ChatMsg,
          i: number,
        ) => <Msg key={i} msg={m} />)}
      </div>
      <div ref={bottomRef} />
    </div>
  );
 }
 function Msg(props: { msg: ChatMsg }) {
  return (
    <div className="Messege text-lg">
      <span className="font-bold">
        {props.msg.role.toUpperCase()}:
      </span>
      <br />
      <span className="ml-8">
        {props.msg.content}
      </span>
    </div>
  );
 }
 function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
  const [recordState, setRecordState] = useState(false);
  const [chatState, setChatState] = useState([{
    role: "system",
    content: "You are a helpful assistant.",
  }]);
  function toggleRecord() {
    if (recordState == false) {
@@ -125,68 +70,41 @@ function Controls(props: { setChatStateFn: any; chat: Array[ChatMsg] }) {
  }
  function sendAudio() {
-    var formData = new FormData();
+    let formData = new FormData();
-    formData.append("audio", new Blob(audioBlobs, { type: "audio/webm" }));
+    let audio = new Blob(audioBlobs, { type: "audio/webm" });
-    fetch("http://100.82.51.22:8001/get-text", {
+    userAudio.push(audio);
    formData.append("audio", audio);
    fetch("/get-text", {
      "method": "POST",
      "body": formData,
    }).then((res) => res.json())
      .then((res) => {
-        console.log(res);
+        setChatState((curState: Array<ChatMsg>) => [
        props.setChatStateFn((curState) => [
          ...curState,
-          { "role": "user", "content": res["user-transcript"] },
+          {
            "role": "user",
            "content": res["user-transcript"],
            "audio": URL.createObjectURL(userAudio[userAudio.length - 1]),
          },
        ]);
-        fetch("http://100.82.51.22:8001/conversation", {
+        fetch("/conversation", {
          "method": "POST",
-          "body": JSON.stringify([...props.chat, {
+          "body": JSON.stringify([...chatState, {
            "role": "user",
            "content": res["user-transcript"],
          }]),
        }).then((res) => res.json())
          .then((res) => {
-            props.setChatStateFn((curState) => [...curState, res]);
+            setChatState((
-            console.log("attempting to play result")
+              curState: Array<ChatMsg>,
-            playMsg(res)
+            ) => [...curState, {
              ...res,
              "audio": "/speak?" + new URLSearchParams({ text: res.content }),
            }]);
          });
      });
  }
  return (
    <div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
      <button
        onClick={() => toggleRecord()}
        className={"inline-flex " + (recordState ? "text-red-500" : "")}
      >
        {recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
        {recordState ? "STOP" : "REC"}
      </button>
      <button
        onClick={() => playRecord()}
        className="inline-flex text-green-500"
      >
        <TbPlayerPlay /> PLAY
      </button>
      <button
        onClick={() => {
          sendAudio();
        }}
        className="inline-flex"
      >
        <TbBrandOpenai /> SEND
      </button>
    </div>
  );
 }
 function App() {
  const [chatState, setChatState] = useState([{
    role: "system",
    content: "You are a helpful assistant.",
  }]);
  return (
    <>
      <div className="h-screen center flex flex-col">
@@ -195,7 +113,12 @@ function App() {
          <hr className="mx-3 border-t-4" />
        </div>
        <Feed chat={chatState} setChatStateFn={setChatState} />
-        <Controls setChatStateFn={setChatState} chat={chatState} />
+        <Controls
          recButtonOnClick={toggleRecord}
          recordState={recordState}
          playButtonOnClick={playRecord}
          sendButtonOnClick={sendAudio}
        />
      </div>
    </>
  );
--- a/speech-speech/frontend/src/components.tsx
+++ b/speech-speech/frontend/src/components.tsx
@@ -0,0 +1,106 @@
 import { useEffect, useRef } from "react";
 import {
  TbBrandOpenai,
  TbMicrophone2,
  TbPlayerPlay,
  TbPlayerStop,
 } from "react-icons/tb";
 /** One entry in the conversation rendered by `Feed` / `Msg`. */
 export type ChatMsg = {
  /**
   * Speaker label. `Feed` hides entries whose role is "system", and `Msg`
   * sets `autoPlay` on the audio element when the role is "assistant".
   */
  role: string;
  /** Message text displayed by `Msg`. */
  content: string;
  /** Optional URL passed as the `src` of the message's `<audio>` element. */
  audio?: string;
 };
 /**
  * Page banner that displays the application title.
  *
  * Takes no props and keeps no state.
  */
 export function Header() {
  return (
    <header className="header p-3">
      <div className="title text-5xl font-extrabold">
        Speech to Speech AI example
      </div>
    </header>
  );
 }
 /**
  * Scrollable message feed.
  *
  * Renders every entry of `props.chat` whose role is not "system" as a `Msg`,
  * and smooth-scrolls to a sentinel element placed after the list whenever the
  * `chat` array changes.
  *
  * @param props.chat - Conversation messages to display.
  * @param props.setChatStateFn - Accepted for caller compatibility; this
  *   component does not use it.
  */
 export function Feed(props: { chat: Array<ChatMsg>; setChatStateFn: any }) {
  // Empty element rendered after the last message; used as the scroll target.
  const bottomRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    // Scroll only when the conversation itself changed, so unrelated parent
    // re-renders do not pull the view away from where the user has scrolled.
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [props.chat]);

  return (
    <div className="feed grow self-center w-5/6 max-w-screen-lg px-6 py-3 overflow-scroll">
      <div className="content-center  space-y-2 divide-y-4">
        {props.chat
          .filter((m: ChatMsg) => m.role !== "system")
          .map((m: ChatMsg, i: number) => <Msg key={i} msg={m} />)}
      </div>
      <div ref={bottomRef} />
    </div>
  );
 }
 /**
  * A single chat entry: the upper-cased role, the message text and, when the
  * message carries an `audio` URL, a native audio player for it.
  *
  * The player is marked `autoPlay` for messages whose role is "assistant".
  *
  * @param props.msg - The message to render.
  */
 export function Msg(props: { msg: ChatMsg }) {
  const { role, content, audio } = props.msg;
  return (
    <div className="Messege text-lg">
      <span className="font-bold">
        {role.toUpperCase()}:
      </span>
      <br />
      <span className="ml-8">
        {content}
      </span>
      {/* `audio` is optional on ChatMsg; skip the player instead of rendering
          a control that has no source. */}
      {audio !== undefined && (
        <audio
          controls
          autoPlay={role === "assistant"}
          src={audio}
        />
      )}
    </div>
  );
 }
 /**
  * Button bar with three actions: record/stop, play and send.
  *
  * The component is purely presentational; all behaviour comes from the
  * callbacks passed in by the caller.
  *
  * @param props.recButtonOnClick - Called when the REC/STOP button is clicked.
  * @param props.recordState - When true the first button is shown in red with
  *   the stop icon and the label "STOP"; otherwise the microphone icon and
  *   "REC".
  * @param props.playButtonOnClick - Called when the PLAY button is clicked.
  * @param props.sendButtonOnClick - Called when the SEND button is clicked.
  */
 export function Controls(
  props: {
    recButtonOnClick: () => void;
    recordState: boolean;
    playButtonOnClick: () => void;
    sendButtonOnClick: () => void;
  },
 ) {
  const {
    recButtonOnClick,
    recordState,
    playButtonOnClick,
    sendButtonOnClick,
  } = props;

  // Handlers are wrapped in arrow functions so the callbacks are invoked with
  // no arguments rather than receiving the click event.
  return (
    <div className="controls self-center flex justify-evenly p-5 text-5xl border-2 border-b-0 w-1/2 max-w-screen-sm min-w-fit">
      <button
        onClick={() => recButtonOnClick()}
        className={"inline-flex " + (recordState ? "text-red-500" : "")}
      >
        {recordState ? <TbPlayerStop /> : <TbMicrophone2 />}
        {recordState ? "STOP" : "REC"}
      </button>
      <button
        onClick={() => playButtonOnClick()}
        className="inline-flex text-green-500"
      >
        <TbPlayerPlay /> PLAY
      </button>
      <button
        onClick={() => sendButtonOnClick()}
        className="inline-flex"
      >
        <TbBrandOpenai /> SEND
      </button>
    </div>
  );
 }
--- a/speech-speech/frontend/vite.config.ts
+++ b/speech-speech/frontend/vite.config.ts
@@ -9,4 +9,7 @@ export default defineConfig({
      "Access-Control-Allow-Origin": '*',
    },
  },
 	build: {
 		outDir: '../backend/dist/',
 	},
 });
Author	SHA1	Message	Date
Andrei Stoica	8aa7bd2e99	cleanup	2024-02-28 12:10:49 -05:00
Andrei Stoica	3c0a9b150b	audio playback for both user and assistant	2024-02-28 11:55:34 -05:00
Andrei Stoica	5cc002a110	assistant response is replayable	2024-02-28 11:33:08 -05:00
Andrei Stoica	7562778f18	merged gitignores	2024-02-26 20:21:17 -05:00
Andrei Stoica	4fd825e1ae	fixing urls	2024-02-26 20:03:58 -05:00
Andrei Stoica	7acdbb3136	cleaning up readmes	2024-02-26 19:58:28 -05:00
Andrei Stoica	f1c2108bc7	updating readmes	2024-02-26 19:43:10 -05:00
Andrei Stoica	056e1067f4	building frontend and serving with fastapi	2024-02-26 13:17:26 -05:00
Andrei Stoica	1916185f19	cleaning up backend sins now	2024-02-26 11:54:00 -05:00
Andrei Stoica	64bb9f9db3	cleaning up my sins	2024-02-26 11:31:01 -05:00
Andrei Stoica	42c605d992	cleanup	2024-02-25 17:47:25 -05:00
		`@@ -1 +0,0 @@`
			`OPENAI_API_KEY=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`