Extracting text from images in Python with pytesseract and finding any email addresses in it using regex














































Extracting text from images in Python with pytesseract and finding any email addresses in it using regex



import re
import pytesseract
from PIL import Image
import os

#function to extract text from images and check if emails exist in text
def extract(image_dir=None, output_path='data.txt'):
    """OCR every file under an image folder and append found emails to a file.

    Each file found while walking ``image_dir`` (including sub-folders) is
    passed to ``pytesseract.image_to_string``; every email-like match in the
    recognised text is appended to ``output_path``, one per line.

    Args:
        image_dir: Folder to scan. Defaults to the ``imgs`` folder located
            next to this script, independent of the current working
            directory.
        output_path: File the matches are appended to. Defaults to
            ``data.txt`` in the current working directory.

    Returns:
        The number of email matches written (the same value that is printed).
    """
    if image_dir is None:
        # Anchor the default folder to the script location so that the
        # directory being walked and the paths handed to OCR always agree.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        image_dir = os.path.join(script_dir, 'imgs')

    # Raw string avoids invalid-escape warnings. The pattern deliberately has
    # no leading whitespace group, so matches are written without stray
    # spaces or newlines in front of them.
    email_pattern = re.compile(r'[0-9a-z.]+@[-a-z0-9.]*\.[a-z]+')

    pages_seen = 0
    emails_found = 0
    # The context manager closes the output file even if OCR raises midway.
    with open(output_path, 'a', encoding='utf-8') as out_file:
        for root, _dirs, files in os.walk(image_dir):
            for file_name in files:
                pages_seen += 1
                # Join with ``root`` so files inside sub-folders resolve too.
                image_path = os.path.join(root, file_name)
                img_text = pytesseract.image_to_string(image_path)
                print("working with page->", pages_seen)
                for email in email_pattern.findall(img_text):
                    emails_found += 1
                    out_file.write(email + "\n")

    print(emails_found)
    return emails_found


# Run the extraction only when this file is executed as a script, so that
# importing the module does not trigger OCR and file writes as a side effect.
if __name__ == "__main__":
    extract()


Comments