0% found this document useful (0 votes)

87 views5 pages

IndicTrans2 PDF to Punjabi Docx Conversion

Contains python script for translation.

Uploaded by

Pragit Sharma

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

87 views5 pages

IndicTrans2 PDF to Punjabi Docx Conversion

Contains python script for translation.

Uploaded by

Pragit Sharma

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

#Run

%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface

#Run
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock "transformers>=4.33.2" mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer

%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

#Run
!pip install pdfplumber

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-

packages (0.11.0)
Requirement already satisfied: pdfminer.six==20231228 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (20231228)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.10/dist-
packages (from pdfplumber) (9.4.0)
Requirement already satisfied: pypdfium2>=4.18.0 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (4.30.0)
Requirement already satisfied: charset-normalizer>=2.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(3.3.2)
Requirement already satisfied: cryptography>=36.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(42.0.7)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-
packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-
packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228-
>pdfplumber) (2.22)

#Run
!pip install python-docx

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-

packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-
packages (from python-docx) (4.9.4)
Requirement already satisfied: typing-extensions>=4.9.0 in
/usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.1)

##Restart session

# Run
import pdfplumber
from docx import Document
from docx.shared import Pt, Inches
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

#Run
def process_pdf(path):
    """Extract the text and the tables of every page of a PDF.

    Args:
        path: Location of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        A list with one string per page that yielded any text (in page
        order), followed by one ``{"type": "table", "content": table}`` dict
        per table, where ``table`` is the value produced by
        ``page.extract_tables()`` for that table.
    """
    page_texts = []
    # Kept separate from the loop variable below: the previous version reused
    # the name ``table`` for both and appended to an undefined list.
    table_summaries = []

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Pages without extractable text yield an empty/None value; skip them.
            if text:
                page_texts.append(text)

            for table in page.extract_tables():
                table_summaries.append({"type": "table", "content": table})

    return page_texts + table_summaries

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq checkpoint and its tokenizer, ready for inference.

    Args:
        ckpt_dir: Checkpoint id or path passed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Direction string passed to ``IndicTransTokenizer``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load with a
            ``BitsAndBytesConfig``; any other value (e.g. ``None``) loads the
            model without a quantization config.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been switched to eval
        mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # NOTE(review): the two bnb_8bit_* keywords are carried over unchanged;
        # confirm that BitsAndBytesConfig actually honours them.
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only an unquantized model is moved explicitly; on CUDA it is also cast
    # to half precision.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate a list of texts in chunks of ``BATCH_SIZE``.

    Args:
        input_sentences: Sequence of source texts.
        src_lang: Source language tag passed to ``ip.preprocess_batch``.
        tgt_lang: Target language tag passed to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes a batch and offers ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        A list with the post-processed output for each chunk, in input order.
        An empty input produces an empty list.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[start : start + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model; no gradients are needed.
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        decoded = tokenizer.batch_decode(
            generated_tokens.detach().cpu().tolist(), src=False
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(decoded, lang=tgt_lang)

        # Drop this chunk's encodings before the next one is built.
        del inputs
        torch.cuda.empty_cache()

    return translations

# Number of inputs pushed through the model per generation call.
BATCH_SIZE = 4
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# None loads the model without quantization; "4-bit" / "8-bit" enable it.
quantization = None
def main_fuc(path, filename):
    """Translate a PDF from English to Punjabi and save it as a .docx file.

    The text of each page is translated with the
    ``ai4bharat/indictrans2-en-indic-1B`` checkpoint, the translations are
    concatenated in page order, and the result is written as a single
    paragraph (12 pt, Raavi font, 1-inch margins) to ``filename + ".docx"``.

    Args:
        path: Location of the source PDF.
        filename: Output file name without the ``.docx`` extension.
    """
    content = process_pdf(path)

    # Only plain strings can be fed to the translator; anything else that
    # process_pdf returns (table dicts) is left out of the document.
    pages = [item for item in content if isinstance(item, str)]
    skipped = len(content) - len(pages)
    if skipped:
        print(f"skipped {skipped} non-text item(s); tables are not translated")

    src_lang, tgt_lang = "eng_Latn", "pan_Guru"

    translations = []
    if pages:
        ip = IndicProcessor(inference=True)

        # Smaller alternative: ai4bharat/indictrans2-en-indic-dist-200M
        en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"

        # Load the model once for the whole document instead of once per page.
        en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
            en_indic_ckpt_dir, "en-indic", quantization
        )

        translations = batch_translate(
            pages, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
        )

        # flush the models to free the GPU memory
        del en_indic_tokenizer, en_indic_model
        torch.cuda.empty_cache()

    main_text = "".join(translations)

    # Create a new Document
    doc = Document()

    # Set 1-inch margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add a paragraph with an explicit run so that a run exists even when the
    # translated text is empty.
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(main_text)

    # Set font type and size
    run.font.size = Pt(12)

    # Use Raavi font; the east-Asian font attribute is set as well.
    run.font.name = 'Raavi'
    run._element.rPr.rFonts.set(qn('w:eastAsia'), 'Raavi')

    # Save the document
    doc.save(filename + ".docx")
    print("complete")

# 1. Filepath: the PDF to translate  2. file: output name without ".docx"
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"

#Run
# Pass the variable, not the literal "file", so the output is named after
# the source document.
main_fuc(Filepath, file)

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer

is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89:
UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab
(https://huggingface.co/settings/tokens), set it as secret in your Google Colab and
restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access
public models or datasets.
warnings.warn(
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

Def Set Random Seed (Seed)
No ratings yet
Def Set Random Seed (Seed)
29 pages
Claude Comparet DB
No ratings yet
Claude Comparet DB
8 pages
Video Processing with Decord Reader
No ratings yet
Video Processing with Decord Reader
7 pages
NLP
No ratings yet
NLP
15 pages
Alpaca + Llama-3 8b Full Example - Ipynb - Colab
No ratings yet
Alpaca + Llama-3 8b Full Example - Ipynb - Colab
10 pages
Fast Llama Training Guide
No ratings yet
Fast Llama Training Guide
5 pages
Causal Self-Attention in PyTorch
No ratings yet
Causal Self-Attention in PyTorch
10 pages
Next With Continuos Run
No ratings yet
Next With Continuos Run
4 pages
QA Using Gemini Langchain ChromaDB PDF
No ratings yet
QA Using Gemini Langchain ChromaDB PDF
2 pages
Intent Recognizer
No ratings yet
Intent Recognizer
5 pages
Wa0029.
No ratings yet
Wa0029.
11 pages
Hand On Day 2 Salinan - Dari - 2 - Using - Transformers
No ratings yet
Hand On Day 2 Salinan - Dari - 2 - Using - Transformers
10 pages
Tutorials Sources Beginner Ptcheat
No ratings yet
Tutorials Sources Beginner Ptcheat
7 pages
22BCE9752 NLPDigital Assignment 02
No ratings yet
22BCE9752 NLPDigital Assignment 02
21 pages
Code2pdf 67c73149b96ef
No ratings yet
Code2pdf 67c73149b96ef
4 pages
Project Source
No ratings yet
Project Source
21 pages
Experimental Pix2pix
No ratings yet
Experimental Pix2pix
5 pages
Vit32 GPTMD
No ratings yet
Vit32 GPTMD
6 pages
Image Captioning Model Development
No ratings yet
Image Captioning Model Development
9 pages
Run 1
No ratings yet
Run 1
57 pages
Working Setup MulTalk - Windows
No ratings yet
Working Setup MulTalk - Windows
2 pages
AI Lab6
No ratings yet
AI Lab6
22 pages
Pgi20s02j - Lab Record
No ratings yet
Pgi20s02j - Lab Record
24 pages
Research Paper Summarization
No ratings yet
Research Paper Summarization
13 pages
Karpathy MinGPT Model
No ratings yet
Karpathy MinGPT Model
7 pages
Easyocr
No ratings yet
Easyocr
8 pages
PDF Chatbot with LangChain Integration
No ratings yet
PDF Chatbot with LangChain Integration
2 pages
Chatbot Code
No ratings yet
Chatbot Code
2 pages
Retorno 1
No ratings yet
Retorno 1
29 pages
OpenAI CLIP vs Mask R-CNN Analysis
No ratings yet
OpenAI CLIP vs Mask R-CNN Analysis
5 pages
Stable Diffusion Report Updated
No ratings yet
Stable Diffusion Report Updated
19 pages
Tensor Flow Programs
No ratings yet
Tensor Flow Programs
30 pages
Birthday Gift Ideas for Data Scientists
No ratings yet
Birthday Gift Ideas for Data Scientists
1 page
Pipeline Flux Ipa
No ratings yet
Pipeline Flux Ipa
18 pages
QLSTMvs LSTM
No ratings yet
QLSTMvs LSTM
7 pages
Transfer Learning for Beginners
No ratings yet
Transfer Learning for Beginners
7 pages
Install Transformers and Torch
No ratings yet
Install Transformers and Torch
4 pages
PyTorch Cheat Sheet & Quick Reference
No ratings yet
PyTorch Cheat Sheet & Quick Reference
6 pages
Lab 1 Summarize Dialogue
No ratings yet
Lab 1 Summarize Dialogue
26 pages
Image Captioning With Visual Attention PDF
No ratings yet
Image Captioning With Visual Attention PDF
16 pages
Chatbot Code
No ratings yet
Chatbot Code
2 pages
Trainrealfill
No ratings yet
Trainrealfill
19 pages
Deep Learning
No ratings yet
Deep Learning
46 pages
Computer Vision Lab Guide
No ratings yet
Computer Vision Lab Guide
120 pages
Text Summarization with Python Code
No ratings yet
Text Summarization with Python Code
7 pages
Sampleui
No ratings yet
Sampleui
3 pages
Message
No ratings yet
Message
3 pages
Medical Text Classifier GabrieldeOlaguibel
No ratings yet
Medical Text Classifier GabrieldeOlaguibel
12 pages
Lab 5
No ratings yet
Lab 5
7 pages
Code Explanation
No ratings yet
Code Explanation
8 pages
German to English Translation with Transformer
No ratings yet
German to English Translation with Transformer
8 pages
PyTorch NLP Tutorial Documentation
No ratings yet
PyTorch NLP Tutorial Documentation
35 pages
Kijai ComfyUI VEnhancer
No ratings yet
Kijai ComfyUI VEnhancer
76 pages
13 Embeddings - Ipynb
No ratings yet
13 Embeddings - Ipynb
4 pages
Video Retalking Setup & Execution
No ratings yet
Video Retalking Setup & Execution
1 page
Fine-Tuned Vs RAG Short Notes ?
No ratings yet
Fine-Tuned Vs RAG Short Notes ?
25 pages
Largescaiass 2
No ratings yet
Largescaiass 2
7 pages
Assignment 7
No ratings yet
Assignment 7
10 pages
ML 1
No ratings yet
ML 1
22 pages
Etabloc Pump Series Overview
No ratings yet
Etabloc Pump Series Overview
4 pages
CM Strategic Perspective
No ratings yet
CM Strategic Perspective
26 pages
Slidesgo Understanding Network Protocols Foundations and Applications in Modern Communication 202410191644038YnN
No ratings yet
Slidesgo Understanding Network Protocols Foundations and Applications in Modern Communication 202410191644038YnN
10 pages
RCM - Reliability Centred Maintenance: Jørn Vatn
100% (1)
RCM - Reliability Centred Maintenance: Jørn Vatn
46 pages
Module On Power Water and Telecommunication
No ratings yet
Module On Power Water and Telecommunication
10 pages
MAAB Style Guideline Version2p2 PDF
No ratings yet
MAAB Style Guideline Version2p2 PDF
113 pages
Safe Work Method Procedure AC Installation
67% (3)
Safe Work Method Procedure AC Installation
9 pages
GSM PSTN Phone Converter Usermanual
No ratings yet
GSM PSTN Phone Converter Usermanual
5 pages
Design Treatments
No ratings yet
Design Treatments
5 pages
Form - Internal Audit Report (BLANK)
80% (10)
Form - Internal Audit Report (BLANK)
5 pages
Saudi Aramco Inspection Checklist
No ratings yet
Saudi Aramco Inspection Checklist
3 pages
Bass Amplifier: Owner's Manual
No ratings yet
Bass Amplifier: Owner's Manual
4 pages
Free Sample PDF Files for Testing
No ratings yet
Free Sample PDF Files for Testing
2 pages
Squash Court Construction Guide
No ratings yet
Squash Court Construction Guide
26 pages
Org Chart Company
No ratings yet
Org Chart Company
5 pages
BR 3027
No ratings yet
BR 3027
1 page
Pelican PC1000 Certification Details
No ratings yet
Pelican PC1000 Certification Details
2 pages
Borehole Drill Work Plan
100% (1)
Borehole Drill Work Plan
3 pages
Man B&W Manual
50% (2)
Man B&W Manual
12 pages
Regulador de Voltaje Lineal 3.3v G1084-33
100% (1)
Regulador de Voltaje Lineal 3.3v G1084-33
8 pages
GTDS AggregatorTAP Copper010119
No ratings yet
GTDS AggregatorTAP Copper010119
2 pages
Cs 1601 Digital Signal Processing
No ratings yet
Cs 1601 Digital Signal Processing
6 pages
Affinity Brochure
No ratings yet
Affinity Brochure
14 pages
Southern Cable Product Specifications
No ratings yet
Southern Cable Product Specifications
6 pages
Juniper QFX 5100 Core Dump Analysis
No ratings yet
Juniper QFX 5100 Core Dump Analysis
18 pages
Internal Audit Executive
No ratings yet
Internal Audit Executive
7 pages
Understanding the Software Development Life Cycle
No ratings yet
Understanding the Software Development Life Cycle
2 pages
Audit Program
No ratings yet
Audit Program
2 pages
HSBC Middle East Black
No ratings yet
HSBC Middle East Black
1 page
G.A. Rathy, SR Lecturer/Electrical, NITTTR, Chennai
100% (1)
G.A. Rathy, SR Lecturer/Electrical, NITTTR, Chennai
81 pages

IndicTrans2 PDF to Punjabi Docx Conversion

Uploaded by

IndicTrans2 PDF to Punjabi Docx Conversion

Uploaded by

#Run

!git clone [Link]

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-

final_output = text_ + table

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):

return tokenizer, model

# Preprocess the batch and extract entity mappings

# Tokenize the batch and generate input encodings

# Generate translations using the model

# Decode the generated tokens into text

# Postprocess the translations, including entity replacement

src_lang, tgt_lang = "eng_Latn", "pan_Guru"

# flush the models to free the GPU memory

# Set 1-inch margins

# Set font type and size

# Use Raavi font

# Save the document

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

You might also like