Alex Merchen
Published © GPL3+

Analyzing Files With Llama 3 Locally On Your PC

Analyze documents locally on your PC with a large language model so you can ask questions about them and get answers.

Intermediate · Full instructions provided · 3 hours · 1,249 views

Things used in this project

Hardware components

AMD Radeon™ PRO W7900
×1

Software apps and online services

AMD Software: Pro Edition
Anaconda
Tesseract

Story


Code

PDFs to Text

Python
import os
import PyPDF2

def extract_text_from_pdfs(pdf_folder, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Iterate over all PDF files in the specified folder
    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            pdf_reader = PyPDF2.PdfReader(pdf_path)

            # Create a filename for the output text file
            output_filename = f"{os.path.splitext(filename)[0]}.txt"
            output_path = os.path.join(output_folder, output_filename)

            # Open the output text file in write mode
            with open(output_path, "w", encoding="utf-8") as text_file:
                # Extract text from each page and write it to the output file
                for page_num, page in enumerate(pdf_reader.pages):
                    # extract_text() can return an empty string for image-only pages
                    text = page.extract_text() or ""

                    # Write a header for each page
                    text_file.write(f"\n\n--- {filename}, Page {page_num + 1} ---\n\n")
                    text_file.write(text)

                    print(f"Extracted text from {filename}, page {page_num + 1}")

if __name__ == "__main__":
    pdf_folder = "PDFs"  # Folder containing the PDF files
    output_folder = "TextFiles"  # Folder where the text files will be saved
    extract_text_from_pdfs(pdf_folder, output_folder)
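
Note: PyPDF2 is no longer actively maintained; development has moved to its successor, pypdf, which keeps a nearly identical API. If you'd rather use the newer package (pip install pypdf), the script above should only need its import swapped. A minimal sketch (the file name here is just a placeholder):

from pypdf import PdfReader

# Same .pages / extract_text() interface as PyPDF2's PdfReader
reader = PdfReader("PDFs/example.pdf")  # placeholder path
for page_num, page in enumerate(reader.pages, start=1):
    text = page.extract_text() or ""
    print(f"--- Page {page_num} ---\n{text}")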

Handwritten Notes to Text

Python
import pytesseract
from PIL import Image

# Path to the image file
image_path = 'test1.png'

# Open the image file
image = Image.open(image_path)

# Use pytesseract to do OCR on the image
text = pytesseract.image_to_string(image)

# Print the extracted text
print(text)
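
Keep in mind that pytesseract is only a wrapper: the Tesseract OCR engine itself has to be installed separately (the Tesseract entry under "Software apps" above). On Windows, if the tesseract executable is not on your PATH, you can point the wrapper at it directly; the path below is a typical default install location, so adjust it to your machine:

import pytesseract

# Only needed if tesseract.exe is not on your PATH (typical Windows default shown)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'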

Processing many Handwritten PDFs to Text

Python
import fitz  # PyMuPDF
import pytesseract
from PIL import Image, ImageDraw
import os

def pdf_to_text_and_images(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    all_text = ""
    
    for page_number in range(len(doc)):
        page = doc.load_page(page_number)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        
        # Perform OCR
        text = pytesseract.image_to_string(img)
        all_text += f"Page {page_number + 1}:\n{text}\n\n"
        
        # Get bounding boxes for each detected word
        boxes = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
        
        # Copy the page image onto a white canvas for annotation
        new_img = Image.new('RGB', (img.width, img.height), color='white')
        new_img.paste(img, (0, 0))
        
        draw = ImageDraw.Draw(new_img)
        
        # Draw green boxes around detected text areas
        n_boxes = len(boxes['level'])
        for i in range(n_boxes):
            if boxes['text'][i].strip() and boxes['level'][i] == 5:  # level 5 corresponds to individual words
                (x, y, w, h) = (boxes['left'][i], boxes['top'][i], boxes['width'][i], boxes['height'][i])
                draw.rectangle([x, y, x + w, y + h], outline='green')
        
        # Save the processed image
        output_image_path = os.path.join(output_folder, f'output_image_page_{page_number + 1}.jpg')
        new_img.save(output_image_path)
        
        print(f"Processed page {page_number + 1}")
    
    return all_text

# Usage
pdf_path = 'YOUR/PDF/FILE/NAME.pdf'
output_folder = 'YOUR/OUTPUT/FOLDER'

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process all pages of the PDF
extracted_text = pdf_to_text_and_images(pdf_path, output_folder)

# Save the entire extracted text to a single file
output_text_path = os.path.join(output_folder, 'full_extracted_text.txt')
with open(output_text_path, 'w', encoding='utf-8') as f:
    f.write(extracted_text)

print(f"Full extracted text saved to: {output_text_path}")

# Display the first processed image in the notebook
# (aliased so it doesn't shadow PIL's Image class)
from IPython.display import Image as IPImage
IPImage(filename=os.path.join(output_folder, 'output_image_page_1.jpg'))
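
One knob worth knowing about: get_pixmap() renders pages at 72 DPI by default, which is often too coarse for handwriting. Passing a scaling matrix renders the page at a higher resolution before OCR, which usually improves Tesseract's results. A minimal sketch of that one change:

import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open('YOUR/PDF/FILE/NAME.pdf')  # same placeholder path as above
page = doc.load_page(0)

# Render at 2x scale (~144 DPI) instead of the default 72 DPI
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
img = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)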

Training on Local Information

Python
import warnings

# Suppress specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Load documents from the 'PDFs' directory
documents = SimpleDirectoryReader("PDFs").load_data()

# Set the embedding model and LLM with specific settings
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

# Create the index from the loaded documents
index = VectorStoreIndex.from_documents(documents)
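
Two practical notes. First, this assumes the Ollama server is running locally and the llama3 model has already been pulled (ollama pull llama3). Second, rebuilding the index on every run gets slow as the document set grows; llama_index can persist the index to disk and reload it on later runs. A minimal sketch, with 'storage' as an arbitrary directory name:

from llama_index.core import StorageContext, load_index_from_storage

# Save the index after building it once
index.storage_context.persist(persist_dir="storage")

# On later runs, load it back instead of re-indexing
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context)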

Discussing with Local LLM

Python
query_engine = index.as_query_engine(streaming=True)
while True:
    question = input("Ask a question (or type 'exit' to quit): ")
    if question.lower() == 'exit':
        break
    print("\n")
    streaming_response = query_engine.query(question)
    streaming_response.print_response_stream()
    print("\n")

Credits

Alex Merchen
24 projects • 40 followers
I'm an EE with a Masters in ECE. I like building things.
