两行代码读取pdf、docx文件

2020-06-22 11:29 881人阅读评论(0)

最近运行课件代码，发现pdf文件读取部分的函数失效。这里找到读取pdf文件的可运行代码，为了方便后续学习使用，我已将pdf和docx读取方法封装成pdfdocx包。

pdfdocx

只有简单的两个读取函数

read_pdf(file)
read_docx(file)

file为文件路径，函数运行后返回file文件内的文本数据。

安装

pip install pdfdocx

使用

读取pdf文件


   
    
     
      
     
     
      
       from pdfdocx import read_pdf

       # Extract the text of the sample PDF and print it.
       p_text = read_pdf('test/data.pdf')
       print(p_text)

Run

这是来自pdf文件内的内容


   
    
     
      
     
     
      
       from pdfdocx import read_docx

       # Extract the text of the sample .docx file and print it.
       # The .docx reader must be used here: read_pdf is neither imported in
       # this snippet nor able to parse a .docx file.
       d_text = read_docx('test/data.docx')
       print(d_text)

Run

这是来自docx文件内的内容

拆开pdfdocx

希望大家能安装好，如果安装或者使用失败，可以使用下面的代码作为备选方案

读取pdf


   
    
     
      
     
     
      
       # Standard library
       import re
       from io import StringIO

       # Third-party: pdfminer building blocks used by read_pdf
       from pdfminer.converter import TextConverter
       from pdfminer.layout import LAParams
       from pdfminer.pdfdocument import PDFDocument
       from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
       from pdfminer.pdfpage import PDFPage
       from pdfminer.pdfparser import PDFParser
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
       def read_pdf(file):
           """Read a PDF file and return the text it contains.

           Every page of the document is run through a pdfminer
           ``TextConverter`` that writes into an in-memory buffer; the
           accumulated buffer content is returned.

           :param file: path to the PDF file
           :return: text content of the PDF as a single string
           """
           # In-memory sink that the converter writes the extracted text into.
           output_string = StringIO()
           with open(file, 'rb') as in_file:
               parser = PDFParser(in_file)
               doc = PDFDocument(parser)
               rsrcmgr = PDFResourceManager()
               device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
               interpreter = PDFPageInterpreter(rsrcmgr, device)
               # Pages must be processed while the source file is still open.
               for page in PDFPage.create_pages(doc):
                   interpreter.process_page(page)
           # The buffer is independent of the file handle, so it can be read
           # after the file has been closed.
           text = output_string.getvalue()
           return text

读取docx


   
    
     
      
     
     
      
       import docx
      
     
    
     
      
     
     
      
         
      
     
    
     
      
     
     
      
       def read_docx(file):
           """Read a .docx file and return the text it contains.

           The text of every entry in ``doc.paragraphs`` is concatenated in
           order, with no separator inserted between paragraphs.

           :param file: path to the .docx file
           :return: text content of the document as a single string
           """
           doc = docx.Document(file)
           # join() builds the result in one pass instead of repeated ``+=``.
           return ''.join(para.text for para in doc.paragraphs)