Python program for word and sentence tokenization
import re

def tokenize_words_22011A6605(text_22011A6605):
    res_22011A6605 = []
    # normalize newlines to spaces, then scan each whitespace-separated chunk
    for word_22011A6605 in text_22011A6605.replace('\n', ' ').split():
        rds_22011A6605, w_22011A6605, vb_22011A6605 = [], "", ""
        for i_22011A6605 in word_22011A6605:
            if i_22011A6605 in "{[(<":
                # opening brackets are emitted as separate tokens
                rds_22011A6605.append(i_22011A6605)
            elif i_22011A6605.isalpha() or i_22011A6605 == "'":
                # letters and apostrophes accumulate into the current word
                w_22011A6605 += i_22011A6605
            else:
                if w_22011A6605:
                    # split the contraction "I'm" into "I" and "'m" so the
                    # suffix is not silently dropped
                    rds_22011A6605.extend(
                        ["I", "'m"] if w_22011A6605 == "I'm" else [w_22011A6605])
                    w_22011A6605 = ""
                if i_22011A6605 in "}])?!":
                    # closing brackets and sentence terminators are separate tokens
                    rds_22011A6605.append(i_22011A6605)
                else:
                    # any other punctuation (e.g. a run of dots) accumulates
                    vb_22011A6605 += i_22011A6605
        # flush whatever is left at the end of the chunk
        if w_22011A6605:
            rds_22011A6605.extend(
                ["I", "'m"] if w_22011A6605 == "I'm" else [w_22011A6605])
        if vb_22011A6605:
            rds_22011A6605.append(vb_22011A6605)
        res_22011A6605.extend(rds_22011A6605)
    return res_22011A6605
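
# Example (an extra check, not part of the original run): opening brackets
# come out before the word and closing brackets after it, e.g.
# tokenize_words_22011A6605("(see [this])") -> ['(', 'see', '[', 'this', ']', ')']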

def tokenize_sentences_22011A6605(text_22011A6605):
    # a sentence is a maximal run of non-terminator characters
    # followed by one of . ! ?
    sentence_pattern_22011A6605 = r'([^.!?]+[.!?])'
    sentences_22011A6605 = re.findall(sentence_pattern_22011A6605,
                                      text_22011A6605)
    # trim the leading space left over after the previous terminator
    sentences_22011A6605 = [sent_22011A6605.strip()
                            for sent_22011A6605 in sentences_22011A6605]
    return sentences_22011A6605
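
# Example (also not part of the original run):
# tokenize_sentences_22011A6605("Hi. Bye!") -> ['Hi.', 'Bye!']
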
print(tokenize_words_22011A6605("Hello! How are you? I'm fine. Thanks for asking..."))
print(tokenize_sentences_22011A6605("Hello! How are you? I'm fine. Thanks for asking..."))
Output:
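['Hello', '!', 'How', 'are', 'you', '?', 'I', "'m", 'fine', '.', 'Thanks', 'for', 'asking', '...']
['Hello!', 'How are you?', "I'm fine.", 'Thanks for asking.']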