import pdfplumber
import requests
import json
from io import BytesIO
class PDFDataProcessor:
    """Download a PDF from a URL and serialize its extracted text as JSON.

    Typical use: call ``set_pdf_url`` and then ``process_pdf_to_json``.
    """

    def __init__(self, timeout=30):
        """Create a processor with no URL configured.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument, so a stalled server cannot block the caller
                indefinitely.
        """
        self.pdf_url = None
        self.timeout = timeout

    def set_pdf_url(self, pdf_url):
        """Store the URL of the PDF that later calls will download."""
        self.pdf_url = pdf_url

    def download_pdf(self):
        """Fetch the PDF at ``self.pdf_url``.

        Returns:
            The raw response body as bytes, or ``None`` when no URL has been
            set or the request fails (the failure is printed to stdout).
        """
        if not self.pdf_url:
            # Fail fast instead of relying on requests rejecting a None URL.
            print("Error downloading PDF:", "no PDF URL has been set")
            return None
        try:
            response = requests.get(self.pdf_url, timeout=self.timeout)
            # Turn 4xx/5xx responses into an exception handled below.
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print("Error downloading PDF:", e)
            return None

    def extract_text_from_pdf(self, pdf_content: bytes) -> str:
        """Return the text of every page of the PDF, concatenated in order.

        Args:
            pdf_content: The raw bytes of a PDF document.

        Returns:
            The page texts joined with no separator between pages.
        """
        with pdfplumber.open(BytesIO(pdf_content)) as pdf:
            # extract_text() can yield None for a page without extractable
            # text (e.g. a scanned image); treat that as an empty string
            # rather than failing on str + None.
            return "".join(page.extract_text() or "" for page in pdf.pages)

    def preprocess_text(self, text: str) -> str:
        """Normalize extracted text for output.

        Newlines are replaced by single spaces and leading/trailing
        whitespace is removed. Further preprocessing steps belong here.
        """
        return text.replace('\n', ' ').strip()

    def process_pdf_to_json(self):
        """Download the configured PDF and return its text as a JSON string.

        Returns:
            A JSON document (4-space indent, non-ASCII characters kept as-is)
            with the keys ``pdf_url`` and ``text``, or ``None`` when the
            download failed or returned no content.
        """
        pdf_content = self.download_pdf()
        if not pdf_content:
            return None

        pdf_text = self.extract_text_from_pdf(pdf_content)
        processed_data = {
            "pdf_url": self.pdf_url,
            "text": self.preprocess_text(pdf_text),
        }
        return json.dumps(processed_data, indent=4, ensure_ascii=False)