Files
kotaemon/libs/ktem/ktem/index/file/utils.py
Tuan Anh Nguyen Dang (Tadashi_Cin) 3bd3830b8d feat: sso login, demo mode & new mindmap support (#644) bump:minor
* fix: update .env.example

* feat: add SSO login

* fix: update flowsetting

* fix: add requirement

* fix: refine UI

* fix: update group id-based operation

* fix: improve citation logic

* fix: UI enhancement

* fix: user_id to string in models

* fix: improve chat suggestion UI and flow

* fix: improve group id handling

* fix: improve chat suggestion

* fix: secure download for single file

* fix: file limiting in docstore

* fix: improve chat suggestion logic & language conformity

* feat: add markmap and select text to highlight function

* fix: update Dockerfile

* fix: user id auto generate

* fix: default user id

* feat: add demo mode

* fix: update flowsetting

* fix: revise default params for demo

* feat: sso_app alternative

* feat: sso login demo

* feat: demo specific customization

* feat: add login using API key

* fix: disable key-based login

* fix: optimize duplicate upload

* fix: gradio routing

* fix: disable arm build for demo

* fix: revise full-text search js logic

* feat: add rate limit

* fix: update Dockerfile with new launch script

* fix: update Dockerfile

* fix: update Dockerignore

* fix: update ratelimit logic

* fix: user_id in user management page

* fix: rename conv logic

* feat: update demo hint

* fix: minor fix

* fix: highlight on long PDF load

* feat: add HF paper list

* fix: update HF papers load logic

* feat: fly config

* fix: update fly config

* fix: update paper list pull api

* fix: minor update root routing

* fix: minor update root routing

* fix: simplify login flow & paper list UI

* feat: add paper recommendation

* fix: update Dockerfile

* fix: update Dockerfile

* fix: update default model

* feat: add long context Ollama through LCOllama

* feat: expose Gradio share to env

* fix: revert customized changes

* fix: list group at app load

* fix: relocate share conv button

* fix: update launch script

* fix: update Docker CI

* feat: add Ollama model selection at first setup

* docs: update README
2025-02-02 15:19:48 +07:00

59 lines
1.5 KiB
Python

import os
import requests
# URL prefixes accepted as Arxiv links. These are plain string prefixes
# (checked with str.startswith in is_arxiv_url), not regular expressions.
ARXIV_URL_PATTERNS = [
    "https://arxiv.org/abs/",
    "https://arxiv.org/pdf/",
]
# Characters that clean_name replaces with "_" before a paper title is used
# as a file name (presumably because common file systems reject them).
ILLEGAL_NAME_CHARS = ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]
def clean_name(name):
    """Return ``name`` with every character of ILLEGAL_NAME_CHARS turned into "_"."""
    # One translation table, one pass over the string.
    substitutions = str.maketrans({bad_char: "_" for bad_char in ILLEGAL_NAME_CHARS})
    return name.translate(substitutions)
def is_arxiv_url(url):
    """Tell whether ``url`` begins with one of the prefixes in ARXIV_URL_PATTERNS."""
    # str.startswith accepts a tuple of candidates and checks them all at once.
    accepted_prefixes = tuple(ARXIV_URL_PATTERNS)
    return url.startswith(accepted_prefixes)
# download PDF from Arxiv URL
def download_arxiv_pdf(url, output_path, timeout=30):
    """Download the PDF of an Arxiv paper into ``output_path``.

    The paper title is read from the ``h1.title`` element of the abstract
    page, passed through ``clean_name`` and used as the file name. If a file
    with that name already exists in ``output_path`` the PDF is not fetched
    again.

    Args:
        url: Arxiv abstract (``/abs/``) or PDF (``/pdf/``) URL.
        output_path: directory the PDF is written to; it is not created here.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        Path of the PDF file inside ``output_path``.

    Raises:
        ValueError: if ``url`` is not an Arxiv URL or no title can be found
            on the abstract page.
        requests.RequestException: if a request fails, times out or returns
            an HTTP error status.
    """
    if not is_arxiv_url(url):
        raise ValueError("Invalid Arxiv URL")

    abstract_url, pdf_url = _arxiv_abstract_and_pdf_urls(url)

    # get paper name from abstract url
    response = requests.get(abstract_url, timeout=timeout)
    response.raise_for_status()

    # parse HTML response and get h1.title
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("h1", class_="title")
    # find() returns None when the element is missing; report that with the
    # same error as an empty title instead of failing with AttributeError
    if title_tag is None:
        raise ValueError("Failed to get paper name")
    name = clean_name(title_tag.text.strip().replace("Title:", ""))
    if not name:
        raise ValueError("Failed to get paper name")

    output_file_path = os.path.join(output_path, name + ".pdf")

    # prevent downloading if file already exists
    if not os.path.exists(output_file_path):
        response = requests.get(pdf_url, timeout=timeout)
        # check the status before writing: an error page saved as "<name>.pdf"
        # would be treated as an already-downloaded paper on every later call
        response.raise_for_status()
        with open(output_file_path, "wb") as f:
            f.write(response.content)

    return output_file_path


def _arxiv_abstract_and_pdf_urls(url):
    """Return ``(abstract_url, pdf_url)`` for an Arxiv abstract or PDF URL."""
    abs_prefix = "https://arxiv.org/abs/"
    pdf_prefix = "https://arxiv.org/pdf/"

    # Only the leading path segment is swapped; a blanket
    # str.replace("abs", "pdf") would also rewrite matches inside the paper id
    # or a ".pdf" suffix.
    if url.startswith(abs_prefix):
        return url, pdf_prefix + url[len(abs_prefix):]
    if url.startswith(pdf_prefix):
        paper_id = url[len(pdf_prefix):]
        # PDF links may end in ".pdf"; drop it when building the abstract URL
        if paper_id.endswith(".pdf"):
            paper_id = paper_id[: -len(".pdf")]
        return abs_prefix + paper_id, url
    raise ValueError("Invalid Arxiv URL")