0% found this document useful (0 votes)
87 views5 pages

IndicTrans2 PDF to Punjabi Docx Conversion

Contains python script for translation.

Uploaded by

Pragit Sharma
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
87 views5 pages

IndicTrans2 PDF to Punjabi Docx Conversion

Contains python script for translation.

Uploaded by

Pragit Sharma
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd

#Run

%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface

#Run
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer


%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

#Run
!pip install pdfplumber

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-packages (0.11.0)
Requirement already satisfied: pdfminer.six==20231228 in /usr/local/lib/python3.10/dist-packages (from pdfplumber) (20231228)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.10/dist-packages (from pdfplumber) (9.4.0)
Requirement already satisfied: pypdfium2>=4.18.0 in /usr/local/lib/python3.10/dist-packages (from pdfplumber) (4.30.0)
Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber) (3.3.2)
Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber) (42.0.7)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (2.22)

#Run
!pip install python-docx

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from python-docx) (4.9.4)
Requirement already satisfied: typing-extensions>=4.9.0 in /usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.1)

## Restart session

# Run
import pdfplumber
from docx import Document
from docx.shared import Pt, Inches
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

#Run
def process_pdf(path):
    """Extract the page text and the tables of a PDF.

    Args:
        path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        A list containing the non-empty text of each page in page order,
        followed by one ``{"type": "table", "content": table}`` dict for
        every table yielded by ``page.extract_tables()``. Callers that only
        need text should keep the ``str`` entries.
    """
    page_texts = []
    # Kept separate from the loop variable below: the original reused the
    # name ``table`` for both and appended to an undefined list.
    table_summaries = []

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # pages with no extractable text contribute nothing
                page_texts.append(text)
            table_summaries.extend(
                {"type": "table", "content": table}
                for table in page.extract_tables()
            )

    return page_texts + table_summaries

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq translation checkpoint together with its tokenizer.

    Args:
        ckpt_dir: Model identifier or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Translation direction given to ``IndicTransTokenizer``
            (the caller in this file passes ``"en-indic"``).
        quantization: ``"4-bit"`` or ``"8-bit"`` to load with a
            ``BitsAndBytesConfig``; any other value loads without one.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # NOTE(review): the bnb_8bit_* keyword names mirror the 4-bit options;
        # confirm BitsAndBytesConfig actually honours them for 8-bit loading.
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only the unquantized model is moved to DEVICE here, and it is cast to
    # half precision when that device is CUDA.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate sentences in chunks of ``BATCH_SIZE``.

    Args:
        input_sentences: Sequence of source-language strings.
        src_lang: Source language tag passed to ``ip.preprocess_batch``.
        tgt_lang: Target language tag passed to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Object whose ``generate`` method produces output token ids.
        tokenizer: Callable that encodes a batch and offers ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        A list of translated strings, in the same order as the input.
        Generation is capped at ``max_length=256`` tokens per sentence, so
        longer outputs are cut off.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[start : start + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        decoded = tokenizer.batch_decode(
            generated_tokens.detach().cpu().tolist(), src=False
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(decoded, lang=tgt_lang)

        # Drop this batch's encodings and release cached GPU blocks before
        # the next batch is encoded.
        del inputs
        torch.cuda.empty_cache()

    return translations

# Number of sentences batch_translate sends through the model at once.
BATCH_SIZE = 4
# Use the GPU when torch reports one as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Mode passed to initialize_model_and_tokenizer: "4-bit", "8-bit", or None
# to load the model without a quantization config.
quantization = None
def main_fuc(path, filename):
    """Translate a PDF from English to Punjabi and save it as a .docx file.

    Args:
        path: PDF to read; forwarded to ``process_pdf``.
        filename: Output name without extension; ``".docx"`` is appended.

    Side effects:
        Writes ``<filename>.docx`` and prints ``"complete"``.
    """
    src_lang, tgt_lang = "eng_Latn", "pan_Guru"
    # Alternative checkpoint: ai4bharat/indictrans2-en-indic-dist-200M
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"

    # Only plain strings can be fed to the translator; any other entry that
    # process_pdf returns (e.g. table records) is skipped.
    en_sents = [item for item in process_pdf(path) if isinstance(item, str)]

    translations = []
    if en_sents:
        # Load the processor, tokenizer and model once for the whole document
        # instead of once per page.
        ip = IndicProcessor(inference=True)
        en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
            en_indic_ckpt_dir, "en-indic", quantization
        )
        try:
            translations = batch_translate(
                en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
            )
        finally:
            # flush the models to free the GPU memory
            del en_indic_tokenizer, en_indic_model
            torch.cuda.empty_cache()

    # Separate the per-page translations so the last word of one page does
    # not run into the first word of the next.
    main_text = "\n".join(translations)

    # Create a new Document
    doc = Document()

    # Set 1-inch margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add a paragraph
    paragraph = doc.add_paragraph(main_text)

    # Set font type and size. An empty text yields a paragraph with no runs,
    # so create one rather than indexing into an empty list.
    run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
    run.font.size = Pt(12)

    # Use Raavi font
    # NOTE(review): Gurmukhi text may be rendered from the complex-script
    # font slot (w:cs) rather than w:eastAsia; confirm in Word.
    run.font.name = 'Raavi'
    run._element.rPr.rFonts.set(qn('w:eastAsia'), 'Raavi')

    # Save the document
    doc.save(f"{filename}.docx")
    print("complete")

# 1. Filepath: the PDF to translate.
# 2. Filename: the output name without extension (".docx" is appended).
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"

#Run
# Pass the variable, not the literal "file", so the output is named after
# the source document.
main_fuc(Filepath, file)

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer


is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

You might also like