
Running LLM on AMD NPU Hardware

I will port my LLM-based Japanese-English machine translation model to AMD's new RyzenAI enabled PC (with NPU).

Advanced · Work in progress · 5,316 views

Things used in this project

An AMD Ryzen AI (NPU-enabled) PC

Story


Code

llama-translate-amd-npu translation sample

Python
This is sample code that uses llama-translate-amd-npu for translation. Download the model checkpoint from the link below.
https://huggingface.co/dahara1/llama-translate-amd-npu
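If you prefer to fetch the checkpoint from a script instead of the browser, huggingface_hub's snapshot_download can pull the whole repository into a local folder. A minimal sketch; the local_dir name is an assumption chosen to match the paths used in the sample below:

# Optional helper: download the model repository with huggingface_hub
# (assumes `pip install huggingface_hub`; local_dir matches the paths below).
from huggingface_hub import snapshot_download

snapshot_download(repo_id="dahara1/llama-translate-amd-npu",
                  local_dir="llama-translate-amd-npu")
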
import torch
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging


def translation(instruction, input):
    system =  """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a highly skilled professional translator. You are a native speaker of English, Japanese, French and Mandarin. Translate the given text accurately, taking into account the context and specific instructions provided. Steps may include hints enclosed in square brackets [] with the key and value separated by a colon:. If no additional instructions or context are provided, use your expertise to consider what the most appropriate context is and provide a natural translation that aligns with that context. When translating, strive to faithfully reflect the meaning and tone of the original text, pay attention to cultural nuances and differences in language usage, and ensure that the translation is grammatically correct and easy to read. For technical terms and proper nouns, either leave them in the original language or use appropriate translations as necessary. Take a deep breath, calm down, and start translating.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

    prompt = f"""{system}
### Instruction:
{instruction}

### Input:
{input}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

    tokenized_input = tokenizer(prompt, return_tensors="pt",
        padding=True, max_length=1600, truncation=True)

    terminators = [
        tokenizer.eos_token_id,
    ]

    outputs = model.generate(tokenized_input['input_ids'],
            max_new_tokens=600,
            eos_token_id=terminators,
            attention_mask=tokenized_input['attention_mask'],
            do_sample=True,
            temperature=0.3,
            top_p=0.5)
    response = outputs[0][tokenized_input['input_ids'].shape[-1]:]
    response_message = tokenizer.decode(response, skip_special_tokens=True)
    return response_message


if __name__ == "__main__":


  transformers.logging.set_verbosity_error()
  logging.disable(logging.CRITICAL)

  set_seed(123)
  p = psutil.Process()
  p.cpu_affinity([0, 1, 2, 3])
  torch.set_num_threads(4)

  tokenizer = AutoTokenizer.from_pretrained("llama3.1-8b_translate-amd-npu")
  tokenizer.pad_token_id = tokenizer.add_special_tokens({'pad_token': '<|finetune_right_pad_id|>'})
  ckpt = r"llama-translate-amd-npu\llama3.1_8b_translate_w_bit_4_awq_amd.pt"

  model = torch.load(ckpt)
  model.eval()
  model = model.to(torch.bfloat16)

  # Route each 4-bit quantized linear layer to the NPU ("aie" = AI Engine)
  # and pack its weights for execution there.
  for n, m in model.named_modules():
      if isinstance(m, qlinear.QLinearPerGrp):
          print(f"Preparing weights of layer : {n}")
          m.device = "aie"
          m.quantize_weights()



  print(translation("Translate Japanese to English.", "1月1日は日本の祝日です。その日は日曜日で、5日ぶりに雨が降りました"))
  print(translation("Translate English to Japanese.", "It’s raining cats and dogs."))
  print(translation("Translate French to Japanese.", "Après la pluie, le beau temps"))
  print(translation("Translate Mandarin to Japanese.", "要功夫深,铁杵磨成针"))

view_olympic_llama-translate.py

Python
Fetch updates from the Olympic live-updates page and display them in Japanese, French and Chinese.
Before starting, download the model from https://huggingface.co/dahara1/llama-translate-amd-npu.
import torch
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging

def translate(instruction, input):
    system =  """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a highly skilled professional translator. You are a native speaker of English, Japanese, French and Mandarin. Translate the given text accurately, taking into account the context and specific instructions provided. Steps may include hints enclosed in square brackets [] with the key and value separated by a colon:. If no additional instructions or context are provided, use your expertise to consider what the most appropriate context is and provide a natural translation that aligns with that context. When translating, strive to faithfully reflect the meaning and tone of the original text, pay attention to cultural nuances and differences in language usage, and ensure that the translation is grammatically correct and easy to read. For technical terms and proper nouns, either leave them in the original language or use appropriate translations as necessary. Take a deep breath, calm down, and start translating.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

    prompt = f"""{system}
### Instruction:
{instruction}

### Input:
{input}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

    tokenized_input = tokenizer(prompt, return_tensors="pt",
        padding=True, max_length=1600, truncation=True)

    terminators = [
        tokenizer.eos_token_id,
    ]

    outputs = model.generate(tokenized_input['input_ids'],
            max_new_tokens=600,
            eos_token_id=terminators,
            attention_mask=tokenized_input['attention_mask'],
            do_sample=True,
            temperature=0.3,
            top_p=0.5)
    response = outputs[0][tokenized_input['input_ids'].shape[-1]:]
    response_message = tokenizer.decode(response, skip_special_tokens=True)
    return response_message

set_seed(123)
p = psutil.Process()
p.cpu_affinity([0, 1, 2, 3])
torch.set_num_threads(4)
transformers.logging.set_verbosity_error()
logging.disable(logging.CRITICAL)

tokenizer = AutoTokenizer.from_pretrained("llama-translate-amd-npu")
# add_special_tokens() registers the pad token on the tokenizer; its return
# value is the number of tokens added, not a token id.
tokenizer.add_special_tokens({'pad_token': '<|finetune_right_pad_id|>'})
ckpt = r"llama-translate-amd-npu\llama3.1_8b_translate_w_bit_4_awq_amd.pt"

model = torch.load(ckpt)
model.eval()
model = model.to(torch.bfloat16)

for n, m in model.named_modules():
    if isinstance(m, qlinear.QLinearPerGrp):
        print(f"Preparing weights of layer : {n}")
        m.device = "aie"
        m.quantize_weights()

### End of LLM setup.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Setup Selenium with Chrome driver
options = webdriver.ChromeOptions()
# Comment out the headless option to see the browser window
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
options.add_argument('--lang=en')

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# URL of the live updates page
url = "https://olympics.com/en/paris-2024/live-updates"

# Function to fetch and display the latest news
def fetch_latest_news():
    # Open the page
    driver.get(url)
    
    # Wait for the page to load completely
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
    except Exception as e:
        print("Page did not load in time:", e)
        driver.quit()
        exit()
    
    # Get the page source
    html_content = driver.page_source
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Find all news sections with the specified class
    # (this class string is generated by the site's front-end build and may change over time)
    news_sections = soup.find_all("div", class_="PostItem-styles__PostPart-sc-3a9e76ca-5 fFkcFP d3lb-post__part d3lb-post__part--text")
    
    # Extract and print the content of each news section
    print("Latest News Texts:")
    for section in news_sections:
        news_text = section.get_text(separator="\n", strip=True)
        print(f"news_text: {news_text}")
        print(translate("Translate English to Mandarin.", news_text), flush=True)
        print("-" * 40, flush=True)  # Separator between news items

# Initial page load and cookie acceptance
driver.get(url)

# Wait for the page to load completely
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
except Exception as e:
    print("Page did not load in time:", e)
    driver.quit()
    exit()

# Accept cookies if the banner appears
try:
    accept_cookies_button = driver.find_element(By.ID, "onetrust-accept-btn-handler")
    accept_cookies_button.click()
    print("Successfully clicked the 'Yes, I am happy' button.")
except Exception as e:
    print("Failed to click the 'Yes, I am happy' button:")

# Wait for the page to process the cookie acceptance
time.sleep(3)

# Periodically fetch and display the latest news
while True:
    fetch_latest_news()
    time.sleep(60)  # Wait for 1 minute before fetching the news again
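
The loop above only prints a Mandarin translation of each item. To match the stated goal of Japanese, French and Chinese output, each news item could instead be passed through a small helper like the sketch below (the helper name translate_item is hypothetical; call it from fetch_latest_news() in place of the single translate() call):

# Optional variant (not in the original script): translate each news item into
# all three target languages, reusing the translate() helper defined above.
def translate_item(news_text):
    for target in ("Japanese", "French", "Mandarin"):
        print(translate(f"Translate English to {target}.", news_text), flush=True)
    print("-" * 40, flush=True)  # Separator between news items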

ALMA-Ja-V3-amd-npu translation sample code

Python
This is sample code that uses ALMA-Ja-V3-amd-npu for translation. Download the model checkpoint from the link below.
https://huggingface.co/dahara1/ALMA-Ja-V3-amd-npu
import torch
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging


def translation(instruction, input):
    system =  """You are a highly skilled professional Japanese-English and English-Japanese translator. Translate the given text accurately, taking into account the context and specific instructions provided. Steps may include hints enclosed in square brackets [] with the key and value separated by a colon:. Only when the subject is specified in the Japanese sentence, the subject will be added when translating into English. If no additional instructions or context are provided, use your expertise to consider what the most appropriate context is and provide a natural translation that aligns with that context. When translating, strive to faithfully reflect the meaning and tone of the original text, pay attention to cultural nuances and differences in language usage, and ensure that the translation is grammatically correct and easy to read. After completing the translation, review it once more to check for errors or unnatural expressions. For technical terms and proper nouns, either leave them in the original language or use appropriate translations as necessary. Take a deep breath, calm down, and start translating."""
    prompt = f"""{system}

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

    tokenized_input = tokenizer(prompt, return_tensors="pt",
        padding=True, max_length=1600, truncation=True)

    terminators = [
        tokenizer.eos_token_id,
    ]

    outputs = model.generate(tokenized_input['input_ids'],
            max_new_tokens=600,
            eos_token_id=terminators,
            attention_mask=tokenized_input['attention_mask'],
            do_sample=True,
            temperature=0.3,
            top_p=0.5)
    response = outputs[0][tokenized_input['input_ids'].shape[-1]:]
    response_message = tokenizer.decode(response, skip_special_tokens=True)
    return response_message


if __name__ == "__main__":

  set_seed(123)
  p = psutil.Process()
  p.cpu_affinity([0, 1, 2, 3])
  torch.set_num_threads(4)
  transformers.logging.set_verbosity_error()
  logging.disable(logging.CRITICAL)

  tokenizer = AutoTokenizer.from_pretrained("ALMA-Ja-V3-amd-npu")
  tokenizer.pad_token = tokenizer.eos_token
  ckpt = r"ALMA-Ja-V3-amd-npu\alma_w_bit_4_awq_fa_amd.pt"

  model = torch.load(ckpt)
  model.eval()
  model = model.to(torch.bfloat16)
 

  for n, m in model.named_modules():
      if isinstance(m, qlinear.QLinearPerGrp):
          print(f"Preparing weights of layer : {n}")
          m.device = "aie"
          m.quantize_weights()


  print(translation("Translate Japanese to English.", "面白きこともなき世を面白く住みなすものは心なりけり"))
  print(translation("Translate English to Japanese.", "Join me, and together we can rule the galaxy as father and son."))

llama3.1-8b-Instruct-amd-npu sample code

Python
This is sample code that runs llama3.1-8b-Instruct-amd-npu on the NPU. Download the model checkpoint from the link below.
https://huggingface.co/dahara1/llama3.1-8b-Instruct-amd-npu
import torch
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging

set_seed(123)
transformers.logging.set_verbosity_error()
logging.disable(logging.CRITICAL)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
]

message_list = [
    "Who are you? ", 
    # Japanese
    "あなたの乗っている船の名前は何ですか?英語ではなく全て日本語だけを使って返事をしてください",
    # Chinese
    "你经历过的最危险的冒险是什么?请用中文回答所有问题,不要用英文。",
    # French
    "À quelle vitesse va votre bateau ? Veuillez répondre uniquement en français et non en anglais.",
    # Korean
    "당신은 그 배의 어디를 좋아합니까? 영어를 사용하지 않고 모두 한국어로 대답하십시오.",
    # German
    "Wie würde Ihr Schiffsname auf Deutsch lauten? Bitte antwortet alle auf Deutsch statt auf Englisch.", 
    # Taiwanese
    "您發現過的最令人驚奇的寶藏是什麼?請僅使用台語和繁體中文回答,不要使用英文。",
]


if __name__ == "__main__":
    p = psutil.Process()
    p.cpu_affinity([0, 1, 2, 3])
    torch.set_num_threads(4)

    tokenizer = AutoTokenizer.from_pretrained("llama3.1-8b-Instruct-amd-npu")
    ckpt = r"llama3.1-8b-Instruct-amd-npu\llama3.1_8b_w_bit_4_awq_amd.pt"
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    model = torch.load(ckpt)
    model.eval()
    model = model.to(torch.bfloat16)

    for n, m in model.named_modules():
        if isinstance(m, qlinear.QLinearPerGrp):
            print(f"Preparing weights of layer : {n}")
            m.device = "aie"
            m.quantize_weights()

    print("system: " + messages[0]['content'])

    for i in range(len(message_list)):
        messages.append({"role": "user",  "content": message_list[i]})
        print("user: " + message_list[i])

        input = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        )

        outputs = model.generate(input['input_ids'],
            max_new_tokens=600,
            eos_token_id=terminators,
            attention_mask=input['attention_mask'],
            do_sample=True,
            temperature=0.6,
            top_p=0.9)

        response = outputs[0][input['input_ids'].shape[-1]:]
        response_message = tokenizer.decode(response, skip_special_tokens=True)
        print("assistant: " + response_message)
        messages.append({"role": "system", "content": response_message})

llama3-8b-amd-npu sample code

Python
This is sample code that runs llama3-8b-amd-npu on the NPU. Download the model checkpoint from the link below.
https://huggingface.co/dahara1/llama3-8b-amd-npu
import torch
import time
import os
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging

set_seed(123)
transformers.logging.set_verbosity_error()
logging.disable(logging.CRITICAL)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
]

message_list = [
    "Who are you? ", 
    # Japanese
    "あなたの乗っている船の名前は何ですか?英語ではなく全て日本語だけを使って返事をしてください",
    # Chinese
    "你经历过的最危险的冒险是什么?请用中文回答所有问题,不要用英文。",
    # French
    "À quelle vitesse va votre bateau ? Veuillez répondre uniquement en français et non en anglais.",
    # Korean
    "당신은 그 배의 어디를 좋아합니까? 영어를 사용하지 않고 모두 한국어로 대답하십시오.",
    # German
    "Wie würde Ihr Schiffsname auf Deutsch lauten? Bitte antwortet alle auf Deutsch statt auf Englisch.", 
    # Taiwanese
    "您發現過的最令人驚奇的寶藏是什麼?請僅使用台語和繁體中文回答,不要使用英文。",
]


if __name__ == "__main__":
    p = psutil.Process()
    p.cpu_affinity([0, 1, 2, 3])
    torch.set_num_threads(4)

    tokenizer = AutoTokenizer.from_pretrained("llama3-8b-amd-npu")
    ckpt = "llama3-8b-amd-npu/pytorch_llama3_8b_w_bit_4_awq_amd.pt"
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    model = torch.load(ckpt)
    model.eval()
    model = model.to(torch.bfloat16)

    for n, m in model.named_modules():
        if isinstance(m, qlinear.QLinearPerGrp):
            print(f"Preparing weights of layer : {n}")
            m.device = "aie"
            m.quantize_weights()

    print("system: " + messages[0]['content'])

    for i in range(len(message_list)):
        messages.append({"role": "user",  "content": message_list[i]})
        print("user: " + message_list[i])

        input = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        )

        outputs = model.generate(input['input_ids'],
            max_new_tokens=600,
            eos_token_id=terminators,
            attention_mask=input['attention_mask'],
            do_sample=True,
            temperature=0.6,
            top_p=0.9)

        response = outputs[0][input['input_ids'].shape[-1]:]
        response_message = tokenizer.decode(response, skip_special_tokens=True)
        print("assistant: " + response_message)
        messages.append({"role": "system", "content": response_message})

Credits

goichi harada