🎍Llama 3.2 model
Presentation Report: AI Model Training Project - BBO Developing
1. Project Introduction
2. Overview of the Pipeline Workflow
3. In-Depth Class-by-Class Pipeline Analysis
Class 1: Build_Data – Real-World Text Data Collection and Preprocessing
```python
import torch
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BartForConditionalGeneration,
    BartTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    pipeline,
)


class Build_Data:
    waiting_list = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Question generation: T5 fine-tuned for question generation
    model_que_name = "valhalla/t5-base-qg-hl"
    model_que = T5ForConditionalGeneration.from_pretrained(model_que_name).to(device)
    tokenizer_que = T5Tokenizer.from_pretrained(model_que_name)

    # Question answering: RoBERTa fine-tuned on SQuAD 2.0
    model_ans_name = "deepset/roberta-large-squad2"
    model_ans = AutoModelForQuestionAnswering.from_pretrained(model_ans_name).to(device)
    tokenizer_ans = AutoTokenizer.from_pretrained(model_ans_name)

    # Paraphrasing: BART fine-tuned on CNN/DailyMail
    model_para_name = "facebook/bart-large-cnn"
    model_para = BartForConditionalGeneration.from_pretrained(model_para_name).to(device)
    tokenizer_para = BartTokenizer.from_pretrained(model_para_name)

    def __init__(self, Dic_contexts, epoch=1):
        self.epoch = epoch
        # Only the first value of the input dictionary (a list of contexts) is used
        self.contexts = list(Dic_contexts.values())[0]
        self.data = []
        self.Context_To_Data()

    def Answering(self, context, question):
        nlp = pipeline('question-answering', model=self.model_ans, tokenizer=self.tokenizer_ans)
        # Split long contexts into 64-token chunks so each fits the model window
        context_chunks = self.chunk_text(context, self.tokenizer_ans, chunk_size=64)
        answers = []
        for chunk in context_chunks:
            answer = nlp({'context': chunk, 'question': question})
            answers.append(answer['answer'])
        final_answer = ' '.join(answers)  # stitch the per-chunk answers together
        return self.paraphrasing(final_answer)

    def chunk_text(self, text, tokenizer, chunk_size=512):
        tokens = tokenizer.encode(text)
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
        return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

    def paraphrasing(self, answer):
        nlp = pipeline('text2text-generation', model=self.model_para, tokenizer=self.tokenizer_para)
        # Generate a more coherent version of the stitched-together answer
        output = nlp(f"paraphrase: {answer}")
        return output[0]['generated_text']

    def Context_To_Data(self):
        for i in range(self.epoch):
            for index, context in enumerate(self.contexts):
                print(f"\rData generation progress: {index + 1}/{len(self.contexts)}", end="", flush=True)
                question = self.Questioning(context)
                answer = self.Answering(context, question)
                self.data.append({"context": context, "question": question, "answer": answer})

    def Questioning(self, context):
        input_text = f"generate question: {context}"
        input_ids = self.tokenizer_que.encode(input_text, return_tensors="pt").to(self.device)
        outputs = self.model_que.generate(
            input_ids,
            max_length=300,  # allow longer questions
            do_sample=True,
            top_k=40,
            top_p=0.95,
            temperature=0.6,
            repetition_penalty=1.6,
            num_return_sequences=1,
        )
        return self.tokenizer_que.decode(outputs[0], skip_special_tokens=True)

    def cal_token(self, text):
        # Approximate token count as the number of whitespace-separated words
        return len(text.split())
```
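For orientation, here is a minimal usage sketch. The dictionary key and context strings are placeholder examples; note that `__init__` consumes only the first value of `Dic_contexts` and starts generation immediately.

```python
# Placeholder input: any {name: [contexts]} dictionary works, but only the
# first value (the list of context strings) is actually consumed.
raw_contexts = {
    "sample_corpus": [
        "Python is a high-level programming language created by Guido van Rossum.",
        "The Transformer architecture was introduced in 2017 in 'Attention Is All You Need'.",
    ]
}

builder = Build_Data(raw_contexts, epoch=1)  # Context_To_Data() runs in __init__
for record in builder.data:
    print(record["question"], "->", record["answer"])
```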
1.1. Purpose and Role:
1.2. Internal Pipeline:
1.3. Output:
1.4. Advanced Processing Features:
Class 2: Generate_Number_Data – Synthetic Logic & Math Data Generator
2.1. Purpose:
2.2. Internal Workflow:
2.3. Pipeline Role:
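The source of this class is not reproduced in the report. As an illustration only, here is a minimal sketch of what a synthetic arithmetic-data generator of this shape could look like, emitting records in the same `{"context", "question", "answer"}` schema that `Build_Data` produces (all internals below are assumptions, not the project's implementation):

```python
import random

# Illustrative sketch only: the real Generate_Number_Data is not shown in this
# report, so its structure here is an assumption.
class Generate_Number_Data:
    def __init__(self, n_samples=1000, max_operand=100):
        ops = {"+": lambda a, b: a + b,
               "-": lambda a, b: a - b,
               "*": lambda a, b: a * b}
        self.data = []
        for _ in range(n_samples):
            a = random.randint(0, max_operand)
            b = random.randint(0, max_operand)
            symbol, fn = random.choice(list(ops.items()))
            self.data.append({
                "context": f"Simple arithmetic: {a} {symbol} {b}.",
                "question": f"What is {a} {symbol} {b}?",
                "answer": str(fn(a, b)),
            })
```

Keeping the record schema identical to `Build_Data`'s output lets the two data sources be concatenated before dataset construction.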
Class 3: ChatDataset – Custom Dataset for HuggingFace Transformers

3.1. Purpose:
3.2. Internal Processing:
3.3. Integration:
3.4. Features for Fine-Tuning:
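The class itself is not listed in this report; the sketch below shows, under stated assumptions, how a HuggingFace-compatible dataset over the `{"context", "question", "answer"}` records could be written. The prompt template, `max_length`, and causal-LM-style labels are all assumptions.

```python
import torch
from torch.utils.data import Dataset

# Illustrative sketch only: the real ChatDataset code is not included in this
# report. Assumes records shaped like Build_Data's output and a tokenizer
# that has a pad token.
class ChatDataset(Dataset):
    def __init__(self, records, tokenizer, max_length=512):
        self.records = records
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        r = self.records[idx]
        prompt = f"Context: {r['context']}\nQuestion: {r['question']}\nAnswer: {r['answer']}"
        enc = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = enc["input_ids"].squeeze(0)
        return {
            "input_ids": input_ids,
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": input_ids.clone(),  # causal-LM style: predict the input itself
        }
```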
Class 4: BBO_Training – Model Training Orchestration

4.1. Objective:
4.2. Training Workflow:
4.3. Highlights:
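As with the previous two classes, the implementation is not reproduced here. Below is a minimal sketch of a training orchestrator built on the HuggingFace `Trainer`, assuming a causal Llama 3.2 checkpoint; the model name, hyperparameters, and output path are placeholders.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Illustrative sketch only: the real BBO_Training class is not shown in this
# report; the checkpoint name and hyperparameters are placeholder assumptions.
class BBO_Training:
    def __init__(self, train_dataset, model_name="meta-llama/Llama-3.2-1B", output_dir="bbo_out"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token  # Llama has no pad token
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.trainer = Trainer(
            model=self.model,
            args=TrainingArguments(
                output_dir=output_dir,
                num_train_epochs=3,
                per_device_train_batch_size=2,
                learning_rate=2e-5,
                logging_steps=50,
            ),
            train_dataset=train_dataset,
        )

    def run(self):
        self.trainer.train()
        self.trainer.save_model()
```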
4. Summary: Full Training Pipeline
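Tying the sketches above together, the full pipeline reduces to a few lines (under the same assumptions; `ChatDataset` needs the tokenizer that matches the training checkpoint):

```python
from transformers import AutoTokenizer

# End-to-end sketch built from the classes above (checkpoint name is a placeholder).
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tok.pad_token = tok.eos_token  # ensure a pad token for fixed-length batches

records = Build_Data(raw_contexts).data + Generate_Number_Data(n_samples=500).data
dataset = ChatDataset(records, tok)
BBO_Training(dataset).run()
```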
5. Recommendations for Future Improvements: Strategic Shift to RAG, API Integration, and Automation
1. Transition from Fine-Tuning to Retrieval-Augmented Generation (RAG)
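To make the recommendation concrete, a toy retrieval step is sketched below, with TF-IDF standing in for a production embedding model and vector store (the class and its API are illustrative, not part of the project):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy retriever: TF-IDF as a stand-in for an embedding model + vector store.
class TinyRetriever:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.doc_matrix = self.vectorizer.fit_transform(documents)

    def top_k(self, query, k=3):
        # Rank documents by cosine similarity to the query and return the best k
        scores = cosine_similarity(self.vectorizer.transform([query]), self.doc_matrix)[0]
        best = scores.argsort()[::-1][:k]
        return [self.documents[i] for i in best]
```

In a RAG setup, the retrieved passages are injected into the prompt at inference time, so new knowledge can be added by updating the document store instead of re-running fine-tuning.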
2. Leveraging API-Based Language Models Combined with RAG
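A sketch of the combination, using the OpenAI Python client as one example of an API-hosted model; the model name and prompt format are assumptions, and any hosted LLM API would fit the same pattern:

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def answer_with_rag(retriever, question):
    # Retrieve supporting passages, then ground the hosted model on them.
    context = "\n".join(retriever.top_k(question, k=3))
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # example model name
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return response.choices[0].message.content
```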
3. Automating Deployment and Maintenance Using n8n