In this project I solve the problem of extracting data out of images. The project focuses on invoice data extraction: it detects the data in invoice images and extracts records such as the invoice date and item descriptions, based on the Tesseract OCR engine.
from PIL import Image
import os
import pandas as pd
import numpy as np
import re,string,unicodedata
#Tesseract Library
import pytesseract
from pytesseract import Output
#Warnings
import warnings
warnings.filterwarnings("ignore")
#Garbage Collection
import gc
#Gensim Library for Text Processing
import gensim.parsing.preprocessing as gsp
from gensim import utils
# Path of the sample invoice image fed to both OCR calls below.
# NOTE(review): `filepath` was referenced but never defined in the original
# script (NameError). It is assumed here to be the same sample invoice that
# the image_to_data call already used -- confirm.
filepath = "../input/invoice-ocr-data/invoice_2.jpg"

# Structured OCR output for the invoice, requested as a pandas DataFrame.
pytesseract.image_to_data(filepath, output_type=Output.DATAFRAME)

# Plain-text OCR of the same invoice, with a timeout of 5 passed to
# pytesseract. The image is opened in a context manager so the underlying
# file handle is released once OCR has finished.
with Image.open(filepath) as invoice_image:
    pytesseract.image_to_string(invoice_image, timeout=5)
# Gensim preprocessing callables gathered for text clean-up.
processes = [
    gsp.strip_tags,                  # remove markup tags
    gsp.strip_multiple_whitespaces,  # collapse repeated whitespace
    gsp.remove_stopwords,            # drop stopwords
]



