Python – lese pdf- og docx-filer
Hvordan lese pdf- og docx-filer i Python
I et prosjekt dukket det opp et behov for å lese gjennom flere tusen pdf- og docx-filer for å finne mønstre og informasjon. Første steg i dette prosjektet blir å åpne alle filene, hente ut teksten og sende den til en egen metode for parsing.
import logging
import os
import re

from docx2python import docx2python # pip install docx2python
from pdfminer.high_level import extract_text # pip install pdfminer.six
# For textract to work with .doc format we need the antiword package, LibreOffice and a symlink hack
# sudo ln -sf /Applications/LibreOffice.app/Contents/MacOS/soffice /usr/local/bin/libreoffice
import textract # pip install textract, pip install antiword
def read_doc(filename: str) -> str:
    """Return the text of a legacy ``.doc`` file, or ``"fail"`` on error.

    The file is handed to ``textract.process`` and the bytes it returns are
    decoded with the default codec of ``bytes.decode`` (UTF-8).

    Args:
        filename: Path of the ``.doc`` file to read.

    Returns:
        The decoded text, or the sentinel string ``"fail"`` if extraction or
        decoding raised an exception.
    """
    try:
        return textract.process(filename).decode()
    except Exception as exc:
        # Deliberately broad: one unreadable file must not abort a batch run.
        # The failure is logged so skipped files can be traced afterwards.
        logging.getLogger(__name__).warning(
            "Could not read .doc file %s: %s", filename, exc
        )
        return "fail"
def read_docx(filename: str) -> str:
    """Return the text of a ``.docx`` file, or ``"fail"`` on error.

    The file is opened with ``docx2python`` and the ``text`` attribute of the
    result is returned.

    Args:
        filename: Path of the ``.docx`` file to read.

    Returns:
        The extracted text, or the sentinel string ``"fail"`` if opening or
        reading the document raised an exception.
    """
    try:
        content = docx2python(filename)
        try:
            text = content.text
        finally:
            # Release the underlying file handle when the installed
            # docx2python version exposes close(); versions without it are
            # left untouched.
            close = getattr(content, "close", None)
            if callable(close):
                close()
    except Exception as exc:
        # Deliberately broad: one unreadable file must not abort a batch run.
        # The failure is logged so skipped files can be traced afterwards.
        logging.getLogger(__name__).warning(
            "Could not read .docx file %s: %s", filename, exc
        )
        return "fail"
    return text
def read_pdf(filename: str) -> str:
    """Return the text of a PDF file, or ``"fail"`` on error.

    The text comes from ``pdfminer.high_level.extract_text`` and is returned
    as-is. It is not wrapped in ``repr()``, which would add surrounding
    quotes and turn real newlines into the two characters ``\\n``; returning
    the plain text keeps this reader consistent with ``read_doc`` and
    ``read_docx``.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text, or the sentinel string ``"fail"`` if extraction
        raised an exception.
    """
    try:
        return extract_text(filename)
    except Exception as exc:
        # Deliberately broad: one unreadable file must not abort a batch run.
        # The failure is logged so skipped files can be traced afterwards.
        logging.getLogger(__name__).warning(
            "Could not read .pdf file %s: %s", filename, exc
        )
        return "fail"
def file_parser(the_file):
    """Parse the text extracted from one document.

    Placeholder hook: it currently does nothing with ``the_file`` and returns
    ``None``. Put the project-specific parsing here if needed.
    """
    return None
# List all .doc, .docx and .pdf files in the folder, grouped by extension.
results_doc = []
results_docx = []
results_pdf = []
docfolder = "files"
for f in os.listdir(docfolder):
    # Compare the actual file extension instead of a regex search: an
    # unescaped "." matches any character and an unanchored pattern matches
    # in the middle of a name (e.g. "notes.pdf.bak"). Lower-casing also picks
    # up files named like "REPORT.PDF".
    extension = os.path.splitext(f)[1].lower()
    if extension == ".doc":
        results_doc.append(f)
    elif extension == ".docx":
        results_docx.append(f)
    elif extension == ".pdf":
        results_pdf.append(f)

# Read every file with the reader for its type and hand the text to the
# parser. Order is preserved: all .doc files, then .docx, then .pdf.
for reader, filenames in (
    (read_doc, results_doc),
    (read_docx, results_docx),
    (read_pdf, results_pdf),
):
    for filename in filenames:
        the_file = reader(os.path.join(docfolder, filename))
        if the_file == "fail":
            # The reader could not extract any text; skip this file.
            continue
        file_parser(the_file)
Det som skjer i denne kodesnutten, er at vi går gjennom alle filer i en mappe som har filendelsen .doc, .docx eller .pdf. Vi leser hver fil ved hjelp av henholdsvis textract (.doc), docx2python (.docx) og pdfminer.six (.pdf), henter ut det som finnes av tekst, og sender dette videre til en egen metode for parsing.