#Run
%%capture
!git clone [Link]
#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface
#Run
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2
mosestokenizer
!python3 -c "import nltk; [Link]('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece
!git clone [Link]
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..
#Run
!pip install pdfplumber
Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-
packages (0.11.0)
Requirement already satisfied: pdfminer.six==20231228 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (20231228)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.10/dist-
packages (from pdfplumber) (9.4.0)
Requirement already satisfied: pypdfium2>=4.18.0 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (4.30.0)
Requirement already satisfied: charset-normalizer>=2.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(3.3.2)
Requirement already satisfied: cryptography>=36.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(42.0.7)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-
packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-
packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228-
>pdfplumber) (2.22)
#Run
!pip install python-docx
Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-
packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-
packages (from python-docx) (4.9.4)
Requirement already satisfied: typing-extensions>=4.9.0 in
/usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.1)
## Restart session
# Run
import pdfplumber
import torch
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt, Inches
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
#Run
def process_pdf(path):
    """Extract the text and the tables of every page of a PDF.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list holding the non-empty text of each page (``str``, in page
        order) followed by one ``{"type": "table", "content": table}`` dict
        per table, where ``table`` is an item yielded by
        ``page.extract_tables()``.
    """
    page_texts = []
    table_summaries = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # pages with no extractable text contribute nothing
                page_texts.append(text)
            # Tables go into their own accumulator; the loop variable must not
            # reuse the accumulator's name or the collected tables are lost.
            table_summaries.extend(
                {"type": "table", "content": table}
                for table in page.extract_tables()
            )
    return page_texts + table_summaries
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq translation checkpoint together with its tokenizer.

    Args:
        ckpt_dir: Model id or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Translation direction passed to ``IndicTransTokenizer``
            (the caller in this file uses ``"en-indic"``).
        quantization: ``"4-bit"`` or ``"8-bit"`` to load through a
            ``BitsAndBytesConfig``; any other value (e.g. ``None``) loads the
            model without a quantization config.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model switched to eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # BitsAndBytesConfig only defines double-quant / compute-dtype options
        # under the bnb_4bit_ prefix, so 8-bit loading takes just the flag.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only an unquantized model is moved (and, on GPU, cast to half precision)
    # by hand; with a quantization config the model is left as loaded.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate a list of sentences in chunks of ``BATCH_SIZE``.

    Args:
        input_sentences: Source-language strings to translate.
        src_lang: Source language code passed to ``ip.preprocess_batch``.
        tgt_lang: Target language code passed to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Model whose ``generate`` method produces the output tokens.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        The concatenated results of ``ip.postprocess_batch`` for every chunk,
        in input order.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[start : start + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model (no gradients needed)
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        decoded = tokenizer.batch_decode(
            generated_tokens.detach().cpu().tolist(), src=False
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(decoded, lang=tgt_lang)

        # Drop this chunk's tensors before the next chunk is encoded
        del inputs
        torch.cuda.empty_cache()

    return translations
# Number of sentences sent through the model per chunk in batch_translate.
BATCH_SIZE = 4
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Quantization mode for model loading: "4-bit", "8-bit", or None for none.
quantization = None
def main_fuc(path, filename):
    """Translate the text of a PDF (eng_Latn -> pan_Guru) into a .docx file.

    Args:
        path: Location of the source PDF, passed to ``process_pdf``.
        filename: Output name without extension; ``".docx"`` is appended.

    The translated text is written as a single paragraph in 12 pt Raavi on a
    page with 1-inch margins, and ``"complete"`` is printed when done.
    """
    src_lang, tgt_lang = "eng_Latn", "pan_Guru"
    # Alternative, smaller checkpoint: ai4bharat/indictrans2-en-indic-dist-200M
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"

    # Only plain strings can be fed to the translator; any other entry
    # returned by process_pdf (e.g. a table record) is left out.
    en_sents = [item for item in process_pdf(path) if isinstance(item, str)]

    # Load the processor, tokenizer and model once for the whole document
    # instead of once per page.
    ip = IndicProcessor(inference=True)
    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
        en_indic_ckpt_dir, "en-indic", quantization
    )
    # NOTE(review): each entry is a whole page of text and batch_translate
    # generates with max_length=256, so long pages are presumably cut short;
    # consider splitting pages into sentences first.
    translations = batch_translate(
        en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
    )
    main_text = "".join(translations)

    # flush the models to free the GPU memory
    del en_indic_tokenizer, en_indic_model

    # Create a new Document
    doc = Document()

    # Set 1-inch margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add the run explicitly so there is always a run to style, even when
    # main_text is empty.
    run = doc.add_paragraph().add_run(main_text)

    # Set font type and size; use Raavi font
    run.font.size = Pt(12)
    run.font.name = 'Raavi'
    # NOTE(review): only the eastAsia font slot is set here; confirm whether
    # the complex-script slot (w:cs) is also needed for Gurmukhi text.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), 'Raavi')

    # Save the document
    doc.save(filename + ".docx")
    print("complete")
# 1. Filepath 2. Filename
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"
#Run
# Pass the `file` variable rather than the literal "file", so the output
# document is named after the PDF instead of always being "file.docx".
main_fuc(Filepath, file)
#Output will be saved into docx
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89:
UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab
(https://huggingface.co/settings/tokens), set it as secret in your Google Colab and
restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access
public models or datasets.
warnings.warn(
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf