A tested, working Python function that splits a Bengali paragraph (or any large block of text) into pieces, each limited to a maximum number of characters.
# Characters that end a sentence: the Bengali danda, ASCII '?' and '!', and
# three Ethiopic punctuation marks kept from the original list.
_SENTENCE_ENDINGS = frozenset({'।', '?', '!', '።', '፧', '፨'})

# Once a piece is within this many characters of ``max_chars`` the splitter
# breaks at the next space instead of waiting for sentence punctuation.
_SOFT_MARGIN = 10


def process_bengali_text(text, max_chars=300):
    """Split *text* into sentence-aligned chunks of roughly ``max_chars``.

    The work happens in two passes:

    1. The text is cut after every sentence-ending punctuation mark, or at
       the first space once the running piece is longer than
       ``max_chars - 10`` characters.
    2. Adjacent pieces are merged pairwise (first with second, third with
       fourth, ...) whenever the merged chunk, joined by a single space,
       fits in ``max_chars``.

    Args:
        text: The text to split.
        max_chars: Target upper bound for the length of each chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks in their original
        order.  A run of text containing neither a space nor sentence
        punctuation is never cut, so such a chunk may exceed ``max_chars``.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    pieces = _split_into_pieces(text, max_chars)
    return _merge_adjacent_pairs(pieces, max_chars)


def _split_into_pieces(text, max_chars):
    """Cut *text* after sentence punctuation, or at a space near the limit."""
    soft_limit = max(max_chars - _SOFT_MARGIN, 0)
    pieces = []
    buffer = []

    def flush():
        # Whitespace-only pieces carry no content, so they are dropped here
        # rather than being special-cased later in the merge pass.
        piece = "".join(buffer).strip()
        if piece:
            pieces.append(piece)
        buffer.clear()

    for char in text:
        # Close to the limit: break at the first space (the space itself is
        # discarded, exactly as stripping the piece would do).
        if char == " " and len(buffer) > soft_limit:
            flush()
            continue
        if char in _SENTENCE_ENDINGS:
            if len(buffer) + 1 <= max_chars:
                buffer.append(char)
                flush()
            else:
                # The piece is already full: emit it as is and let the
                # punctuation mark start the next piece.
                flush()
                buffer.append(char)
        else:
            buffer.append(char)

    flush()
    return pieces


def _merge_adjacent_pairs(pieces, max_chars):
    """Join neighbouring pieces two at a time when the pair fits the limit."""
    merged = []
    index = 0
    total = len(pieces)
    while index < total:
        current = pieces[index]
        has_next = index + 1 < total
        # "+ 1" accounts for the space re-inserted between the stripped pieces.
        if has_next and len(current) + 1 + len(pieces[index + 1]) <= max_chars:
            merged.append(current + " " + pieces[index + 1])
            index += 2  # the neighbour is consumed, never emitted twice
        else:
            merged.append(current)
            index += 1
    return merged
Comments
Post a Comment