Mirror of https://github.com/Cinnamon/kotaemon.git (synced 2025-12-16 11:47:48 +01:00)
59 lines
1.5 KiB
Python
import os

import requests
from bs4 import BeautifulSoup

# URL prefixes identifying Arxiv abstract and PDF pages
ARXIV_URL_PATTERNS = [
    "https://arxiv.org/abs/",
    "https://arxiv.org/pdf/",
]

# characters not allowed in file names on common filesystems
ILLEGAL_NAME_CHARS = ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]


def clean_name(name):
    # replace each illegal character with an underscore
    for char in ILLEGAL_NAME_CHARS:
        name = name.replace(char, "_")
    return name


def is_arxiv_url(url):
    return any(url.startswith(pattern) for pattern in ARXIV_URL_PATTERNS)


# download PDF from Arxiv URL
def download_arxiv_pdf(url, output_path):
    if not is_arxiv_url(url):
        raise ValueError("Invalid Arxiv URL")

    # derive both the abstract and PDF URLs from whichever form was given
    is_abstract_url = "abs" in url
    if is_abstract_url:
        pdf_url = url.replace("abs", "pdf")
        abstract_url = url
    else:
        pdf_url = url
        abstract_url = url.replace("pdf", "abs")

    # get paper name from abstract url
    response = requests.get(abstract_url)

    # parse HTML response and get h1.title
    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("h1", class_="title")
    if title_tag is None:
        raise ValueError("Failed to get paper name")

    name = clean_name(title_tag.text.strip().replace("Title:", ""))
    if not name:
        raise ValueError("Failed to get paper name")

    output_file_path = os.path.join(output_path, name + ".pdf")
    # prevent downloading if file already exists
    if not os.path.exists(output_file_path):
        response = requests.get(pdf_url)
        with open(output_file_path, "wb") as f:
            f.write(response.content)

    return output_file_path
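For reference, a minimal usage sketch of the function above. The output directory name and the arXiv ID are illustrative, not part of the module:

if __name__ == "__main__":
    # hypothetical example: fetch a paper by its abstract URL into ./papers
    os.makedirs("papers", exist_ok=True)
    path = download_arxiv_pdf("https://arxiv.org/abs/1706.03762", "papers")
    print(f"Saved to {path}")

Passing the PDF form of the same link ("https://arxiv.org/pdf/...") works too, since the function derives the abstract URL to read the paper title either way.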