import argparse

import fitz  # PyMuPDF
import requests


# Extract the text of the first page of a PDF
def extract_text(pdf_path):
    doc = fitz.open(pdf_path)  # Open the PDF file
    remaining_text = ""
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if block['type'] == 0:  # Only handle text blocks
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text']
                        span_rect = fitz.Rect(span['bbox'])  # Bounding box of the span (currently unused)
                        remaining_text += text + '\n'
        break  # Only process the first page
    return remaining_text
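# Usage sketch (the file name below is hypothetical): extract_text("paper.pdf")
# returns the first page's text with one span per line, which is the context
# later passed to the model so it can locate the author-provided keywords.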
# Call the Ollama chat API to extract the keywords from the text
def ollama_extract_keywords(url, text):
    data = {
        "model": "qwen2.5:14b",
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": f"On the first page of a paper, please extract all the keywords that come with it, only the keywords written by the author in the keyword section, and only tell me the results. Remember to remove all symbols, do not reply to other messages, and use commas to separate each keyword:\n{text}"
            }
        ],
        "options": {
            "temperature": 0
        }
    }
    res = requests.post(url, json=data)
    if res.status_code == 200:
        keywords = res.json()
        return keywords['message']['content'].strip()
    else:
        return None
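# For reference, a successful non-streaming response from the Ollama chat
# endpoint (typically .../api/chat) is expected to look roughly like the
# following; this shape is an assumption about the server payload, so verify
# it against your Ollama version:
#
# {
#   "model": "qwen2.5:14b",
#   "message": {"role": "assistant", "content": "keyword one, keyword two"},
#   "done": true
# }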
# Process a single PDF file: extract its text and return the normalized keyword list
def process_pdfs_in_folder(url, file_path):
    # Extract the PDF text (first page only)
    text = extract_text(file_path)

    # Call the Ollama model to get the keywords; retry until the reply is non-empty
    keywords_str = ollama_extract_keywords(url, text)
    while not keywords_str:
        keywords_str = ollama_extract_keywords(url, text)

    # Parse the returned string into a keyword list
    keywords_list = process_keywords(keywords_str)

    # Normalize: lowercase and replace hyphens with spaces
    results = []
    for keyword in keywords_list:
        results.append(keyword.lower().replace('-', ' '))
    return results
# Parse the returned keyword string and convert it into a list
def process_keywords(keywords_str):
    # The keywords are assumed to be comma-separated
    keywords = [keyword.strip() for keyword in keywords_str.split(',')]
    return keywords
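# Example (input string is made up for illustration):
#   process_keywords("Deep Learning, Graph Neural Networks,  NLP")
#   -> ['Deep Learning', 'Graph Neural Networks', 'NLP']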
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", required=True, help="Path of the PDF files")
    parser.add_argument("-u", "--url", required=True, help="OLLAMA API URL")
    args = parser.parse_args()
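    # Minimal sketch of a possible completion (assumption; the original
    # section stops at parse_args()): process the given PDF and print the
    # extracted keywords.
    keywords = process_pdfs_in_folder(args.url, args.file)
    print(', '.join(keywords))


# Assumed standard entry-point guard, not present in the original section.
if __name__ == "__main__":
    main()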