Inspired by the Live LLaVA project, which uses a multimodal model to read real-time camera data and describe it with LLaVA, I realized the potential of using large multimodal models to extract information from videos and store it in a database. By adding Retrieval-Augmented Generation (RAG) on top, users can interact with their own videos.
In our daily lives, we capture many joyful and significant moments and record important company meetings. Over time, these videos accumulate, making it increasingly difficult to manage them and extract relevant information. While you could upload your footage to cloud multimodal services such as ChatGPT to understand the content, that approach risks leaking your information. This project solves the problem locally, protecting your privacy while letting you quickly access the important information in your videos.
For this project, we used a short segment from this video as input, and chose the reComputer J4012, equipped with a Jetson Orin NX 16GB, as the machine for LLaVA inference because of its strong AI inference capability.
Step 1: Run LLaVA
To use the multimodal model LLaVA with Ollama throughout the RAG pipeline, the first step is to start Ollama and make sure it is serving the llava model. Run the following commands in a terminal.
Install the jetson-containers tools:
git clone https://github.com/dusty-nv/jetson-containers
bash jetson-containers/install.sh
Run the Ollama container:
jetson-containers run --name ollama $(autotag ollama)
# Run the command below inside the ollama container
ollama run llava
Here is the result; this service must remain active throughout the project:
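Before moving on, you can optionally confirm that Ollama is reachable and the llava model responds. Below is a minimal sketch using Ollama's HTTP API, assuming the default port 11434; this check is not part of the original project.
import requests  # assumes the requests package is available in your environment

# Ask the llava model for a one-word reply through Ollama's /api/generate endpoint.
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "llava", "prompt": "Reply with the single word: ready", "stream": False},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["response"])  # should print a short confirmation from llava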
Step 2: Run the Project
Open a new terminal (Ctrl+Alt+T) and run the commands below to clone the project:
cd jetson-containers && cd data
git clone https://github.com/Seeed-Projects/Multimodal-RAG-on-Jetson
Run the l4t-pytorch container:
cd ..
jetson-containers run $(autotag l4t-pytorch)
cd data/Multimodal-RAG-on-Jetson
Here is the result:
Run the project:
bash run.sh
Result:
Technical Details
Step 1: Extract audio and pictures from the video
Extract the audio from the video and divide it into segments, each lasting two seconds.
# Python3
def extract_audio(self):
    video = VideoFileClip(os.path.join(self.video_path, "seeed.mp4"))
    audio_part = video.audio
    audio_part.write_audiofile(os.path.join(self.output_audio_path, "output_audio.mp3"))

def segment_audio(self):
    audio = AudioSegment.from_mp3(os.path.join(self.output_audio_path, "output_audio.mp3"))
    chunk_length_ms = 2000
    chunks = make_chunks(audio, chunk_length_ms)
    for i, chunk in enumerate(chunks):
        chunk_name = os.path.join(self.output_audio_path, f"{i}.mp3")
        chunk.export(chunk_name, format="mp3")
    os.remove(os.path.join(self.output_audio_path, "output_audio.mp3"))
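For reference, these two methods are driven by the project's main block (shown in full at the end of this article). A minimal standalone usage sketch of the audio stage, assuming the ./video folder contains seeed.mp4 and the ./audio folder already exists:
# Standalone sketch of the audio stage; mirrors the project's main block.
processor = VideoProcessor('./video/', './audio/', './image/', './text/')
processor.extract_audio()    # writes ./audio/output_audio.mp3
processor.segment_audio()    # splits it into 0.mp3, 1.mp3, ... (two seconds each) and deletes the full file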
Extract a frame from the video every two seconds (fps=0.5, so frame n corresponds to time 2n seconds).
def extract_frames(self):
    clip = VideoFileClip(os.path.join(self.video_path, "seeed.mp4"))
    clip.write_images_sequence(os.path.join(self.image_path, "%04d.png"), fps=0.5)
Step 2: Convert audio to text and describe the pictures as text
Use Whisper to convert each audio chunk to text and add a timestamp; since each chunk i.mp3 is two seconds long, its timestamp is simply i × 2 seconds.
def extract_text(self):
    model = whisper.load_model("base.en")
    audio_text = ''
    for filename in os.listdir(self.output_audio_path):
        file_path = os.path.join(self.output_audio_path, filename)
        result = model.transcribe(file_path)
        time = int(filename[:-4]) * 2
        audio_text += f'At time {time}s:' + result['text'] + '\n'
    with open(os.path.join(self.text_path, "audio.md"), "w") as file:
        file.write(audio_text)
Use LLaVA (via Ollama) to describe each extracted frame in one line and add a timestamp derived from the frame index.
def image_to_text(self):
    mm_model = OllamaMultiModal(model='llava', temperature=0)
    image_file_names = self.get_image_path()
    for image in image_file_names:
        print(image)
        # './image/0002.png' -> frame index 2 -> 4 s (relies on image_path being './image/').
        time = 2 * int(image[8:-4])
        self.response += f'At time {time}s:' + str(mm_model.complete(prompt='summarize the image and output as markdown format with one line', image_documents=[ImageDocument(image_path=image)])) + '\n'
    with open(self.text_path + 'image.md', 'w') as file:
        file.write(self.response)
Step 3: Retrieval and Generation
LLaVA answers your questions based on the retrieved transcript and frame descriptions from your video.
def reply(self):
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
    llm = Ollama(model='llava', request_timeout=100, temperature=0)
    Settings.llm = llm
    Settings.embed_model = embed_model
    data = SimpleDirectoryReader(self.text_path).load_data()
    index = VectorStoreIndex.from_documents(data)
    query_engine = index.as_query_engine(similarity_top_k=3)
    while True:
        try:
            user_input = input('\033[94m' + "Prompt: " + '\033[0m')
            response = query_engine.query(user_input)
            print(response)
        except KeyboardInterrupt:
            break
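Rebuilding the index from ./text on every run is fine for a short clip, but for longer videos you may prefer to persist it. Below is a minimal sketch using llama-index's storage API; the ./storage directory name is my own choice and is not part of the original project.
import os
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex, load_index_from_storage

# Settings.llm and Settings.embed_model must already be configured as in reply().
persist_dir = "./storage"  # assumed location, not used by the original project
if os.path.exists(persist_dir):
    # Reload the previously built index instead of re-embedding the text files.
    index = load_index_from_storage(StorageContext.from_defaults(persist_dir=persist_dir))
else:
    index = VectorStoreIndex.from_documents(SimpleDirectoryReader('./text/').load_data())
    index.storage_context.persist(persist_dir=persist_dir)
query_engine = index.as_query_engine(similarity_top_k=3)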
Here are all the dependency packages:
sudo apt update && sudo apt install ffmpeg -y
pip install pydub
pip install -U openai-whisper
pip install llama-index
pip install ftfy regex tqdm
pip install llama-index-multi-modal-llms-ollama
pip install llama-index-llms-ollama
pip install llama-index-llms-huggingface
pip install moviepy
pip install llama-index-embeddings-huggingface
pip install llama-index-vector-stores-lancedb
pip install llama-index-embeddings-clip
pip install git+https://github.com/openai/CLIP.git
Here is the whole code:
import os
from moviepy.editor import VideoFileClip
import whisper
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.core.schema import ImageDocument
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
from pydub import AudioSegment
from pydub.utils import make_chunks


class VideoProcessor:
    def __init__(self, video_path, output_audio_path, image_path, text_path):
        self.video_path = video_path
        self.output_audio_path = output_audio_path
        self.image_path = image_path
        self.text_path = text_path

    def extract_audio(self):
        # Extract the full audio track from the input video.
        video = VideoFileClip(os.path.join(self.video_path, "seeed.mp4"))
        audio_part = video.audio
        audio_part.write_audiofile(os.path.join(self.output_audio_path, "output_audio.mp3"))

    def segment_audio(self):
        # Split the audio into two-second chunks named 0.mp3, 1.mp3, ...
        audio = AudioSegment.from_mp3(os.path.join(self.output_audio_path, "output_audio.mp3"))
        chunk_length_ms = 2000
        chunks = make_chunks(audio, chunk_length_ms)
        for i, chunk in enumerate(chunks):
            chunk_name = os.path.join(self.output_audio_path, f"{i}.mp3")
            chunk.export(chunk_name, format="mp3")
        os.remove(os.path.join(self.output_audio_path, "output_audio.mp3"))

    def extract_text(self):
        # Transcribe every two-second chunk with Whisper and prefix it with its timestamp.
        model = whisper.load_model("base.en")
        audio_text = ''
        for filename in os.listdir(self.output_audio_path):
            file_path = os.path.join(self.output_audio_path, filename)
            result = model.transcribe(file_path)
            time = int(filename[:-4]) * 2
            audio_text += f'At time {time}s:' + result['text'] + '\n'
        with open(os.path.join(self.text_path, "audio.md"), "w") as file:
            file.write(audio_text)

    def extract_frames(self):
        # Save one frame every two seconds (fps=0.5) as 0000.png, 0001.png, ...
        clip = VideoFileClip(os.path.join(self.video_path, "seeed.mp4"))
        clip.write_images_sequence(os.path.join(self.image_path, "%04d.png"), fps=0.5)

    def process_video(self):
        self.extract_audio()
        self.segment_audio()
        self.extract_text()
        self.extract_frames()


class translate_image_to_text:
    def __init__(self, image_path, text_path):
        self.image_path = image_path
        self.text_path = text_path
        self.response = ''

    def get_image_path(self):
        image_folder = self.image_path
        image_files = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if os.path.isfile(os.path.join(image_folder, f))]
        return image_files

    def image_to_text(self):
        # Describe each frame in one line with LLaVA and prefix it with its timestamp.
        mm_model = OllamaMultiModal(model='llava', temperature=0)
        image_file_names = self.get_image_path()
        for image in image_file_names:
            print(image)
            # './image/0002.png' -> frame index 2 -> 4 s (relies on image_path being './image/').
            time = 2 * int(image[8:-4])
            self.response += f'At time {time}s:' + str(mm_model.complete(prompt='summarize the image and output as markdown format with one line', image_documents=[ImageDocument(image_path=image)])) + '\n'
        with open(self.text_path + 'image.md', 'w') as file:
            file.write(self.response)

    def reply(self):
        # Build a vector index over the transcript and frame descriptions, then answer questions interactively.
        embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
        llm = Ollama(model='llava', request_timeout=100, temperature=0)
        Settings.llm = llm
        Settings.embed_model = embed_model
        data = SimpleDirectoryReader(self.text_path).load_data()
        index = VectorStoreIndex.from_documents(data)
        query_engine = index.as_query_engine(similarity_top_k=3)
        while True:
            try:
                user_input = input('\033[94m' + "Prompt: " + '\033[0m')
                response = query_engine.query(user_input)
                print(response)
            except KeyboardInterrupt:
                break


if __name__ == '__main__':
    # The pipeline expects the input video at ./video/seeed.mp4 and uses these folders for intermediate files.
    video_path = './video/'
    output_audio_path = './audio/'
    image_path = './image/'
    text_path = './text/'
    processor = VideoProcessor(video_path, output_audio_path, image_path, text_path)
    processor.process_video()
    text = translate_image_to_text(image_path=image_path, text_path=text_path)
    text.image_to_text()
    text.reply()
For more information, check here.