1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
| import argparse import fitz import requests
def extract_text_without_headers_footers(pdf_path, header_height=50, footer_height=50):
    """Return the text spans of the first page of a PDF, one span per line.

    Each span of every block whose ``type`` is 0 is emitted followed by a
    newline; blocks of any other type are skipped.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.
        header_height: Currently unused; kept so existing callers still work.
        footer_height: Currently unused; kept so existing callers still work.

    Returns:
        The span texts, each terminated by a newline character. The result is
        an empty string when no text span is found.
    """
    # NOTE(review): despite the function name, no header/footer filtering is
    # applied -- the two height parameters are never read. Confirm whether
    # filtering by span position was intended.
    span_texts = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:
                    continue
                for line in block["lines"]:
                    span_texts.extend(span["text"] for span in line["spans"])
            # NOTE(review): the source had lost its indentation; this `break`
            # is taken to end the page loop so only the first page is read,
            # which matches the "first page" wording of the prompt sent to the
            # model elsewhere in this file -- confirm.
            break
    return "".join(f"{text}\n" for text in span_texts)
def ollama_extract_keywords(url, text):
    """POST a chat request asking the model for the author keywords in *text*.

    Args:
        url: URL the JSON request is POSTed to.
        text: Text appended to the prompt after the instructions.

    Returns:
        The stripped ``message.content`` string of the JSON reply when the
        HTTP status is 200, otherwise ``None`` (a falsy value the caller can
        use to decide to retry).
    """
    prompt = (
        "On the first page of a paper, please extract all the keywords that come with it, "
        "only the keywords written by the author in the keyword section, "
        "and only tell me the results. "
        "Remember to remove all symbols, do not reply to other messages, "
        "and use commas to separate each keyword:\n"
        f"{text}"
    )
    payload = {
        "model": "qwen2.5:14b",
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "options": {
            # Temperature 0 is requested for every call.
            "temperature": 0
        },
    }
    response = requests.post(url, json=payload)
    if response.status_code != 200:
        # Previously `return NULL`, which is not a Python name and raised
        # NameError instead of signalling failure to the caller.
        return None
    return response.json()['message']['content'].strip()
def process_pdfs_in_folder(url, file_path):
    """Extract the keywords of one PDF and return them normalised.

    The PDF text is obtained once with ``extract_text_without_headers_footers``.
    ``ollama_extract_keywords`` is then called repeatedly until it returns a
    truthy value.

    Args:
        url: URL passed through to ``ollama_extract_keywords``.
        file_path: Path of the PDF file to read.

    Returns:
        A list of keywords, lower-cased and with every ``-`` replaced by a
        space.
    """
    page_text = extract_text_without_headers_footers(file_path)

    # Keep asking until the model call yields a non-empty answer.
    while True:
        raw_keywords = ollama_extract_keywords(url, page_text)
        if raw_keywords:
            break

    return [
        keyword.lower().replace('-', ' ')
        for keyword in process_keywords(raw_keywords)
    ]
def process_keywords(keywords_str):
    """Split a comma-separated keyword string into a list of clean keywords.

    Args:
        keywords_str: Keywords separated by commas.

    Returns:
        The keywords with surrounding whitespace stripped. Entries that are
        empty after stripping (from an empty input, doubled commas or a
        trailing comma) are dropped, so the list never contains ``""``.
    """
    stripped = (part.strip() for part in keywords_str.split(','))
    return [keyword for keyword in stripped if keyword]
def main():
    """Parse the command line, extract the PDF's keywords and print them.

    Both ``-f/--file`` and ``-u/--url`` are required. Each keyword returned by
    ``process_pdfs_in_folder`` is printed on its own line.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-f", "--file", required=True, help="Path of the PDF files")
    cli.add_argument("-u", "--url", required=True, help="OLLAMA API URL")
    options = cli.parse_args()

    for keyword in process_pdfs_in_folder(options.url, options.file):
        print(keyword)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|