Add scheduler
Browse files- app.py +12 -1
- requirements.txt +3 -1
- update_scheduler.py +114 -0
app.py
CHANGED
@@ -1,12 +1,23 @@
|
|
1 |
#!/usr/bin/env python
|
2 |
|
|
|
|
|
3 |
import gradio as gr
|
4 |
|
5 |
from papers import PaperList, get_df
|
|
|
|
|
|
|
6 |
|
7 |
paper_list = PaperList(get_df('papers.csv'))
|
8 |
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
with gr.Blocks(css='style.css') as demo:
|
12 |
gr.Markdown(DESCRIPTION)
|
|
|
1 |
#!/usr/bin/env python
|
2 |
|
3 |
+
import os
|
4 |
+
|
5 |
import gradio as gr
|
6 |
|
7 |
from papers import PaperList, get_df
|
8 |
+
from update_scheduler import UpdateScheduler
|
9 |
+
|
10 |
+
# Markdown heading shown at the top of the UI.
DESCRIPTION = '''# list of [Daily Papers](https://huggingface.co/papers)'''

paper_list = PaperList(get_df('papers.csv'))

# The background updater is only started when SPACE_ID is present in the
# environment; without it the app runs with no scheduler.
SPACE_ID = os.getenv('SPACE_ID')
if SPACE_ID is not None:
    # Cron fields can be overridden through the environment. The defaults
    # are hour '*/4' and minute '0'.
    CRON_HOUR = os.getenv('CRON_HOUR', '*/4')
    CRON_MINUTE = os.getenv('CRON_MINUTE', '0')
    scheduler = UpdateScheduler(
        space_id=SPACE_ID,
        cron_hour=CRON_HOUR,
        cron_minute=CRON_MINUTE,
    )
    scheduler.start()
|
21 |
|
22 |
with gr.Blocks(css='style.css') as demo:
|
23 |
gr.Markdown(DESCRIPTION)
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
|
|
1 |
gradio==3.39.0
|
|
|
2 |
pandas==2.0.3
|
3 |
requests==2.31.0
|
4 |
-
tqdm==4.
|
|
|
1 |
+
apscheduler==3.10.3
|
2 |
gradio==3.39.0
|
3 |
+
huggingface_hub==0.16.4
|
4 |
pandas==2.0.3
|
5 |
requests==2.31.0
|
6 |
+
tqdm==4.66.1
|
update_scheduler.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import pathlib
|
3 |
+
import re
|
4 |
+
import tempfile
|
5 |
+
|
6 |
+
import pandas as pd
|
7 |
+
import requests
|
8 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
9 |
+
from huggingface_hub import HfApi, Repository
|
10 |
+
from huggingface_hub.utils import RepositoryNotFoundError
|
11 |
+
|
12 |
+
|
13 |
+
class SpaceRestarter:
    """Restart a Hugging Face Space through the Hub API.

    The constructor checks that the ambient HF token has write permission
    and that the Space can be looked up, so a misconfiguration fails at
    construction time instead of at the first restart.
    """

    def __init__(self, space_id: str):
        """Validate the token and the Space, then remember the Space ID.

        Args:
            space_id: Repo ID of the Space to restart.

        Raises:
            ValueError: If the HF token does not have write permission, or
                if looking up ``space_id`` raises RepositoryNotFoundError.
        """
        self.api = HfApi()
        if self.api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')
        try:
            self.api.space_info(repo_id=space_id)
        except RepositoryNotFoundError as err:
            # Chain the Hub error explicitly so the traceback keeps the
            # original failure details.
            raise ValueError('The Space ID does not exist.') from err
        self.space_id = space_id

    def restart(self) -> None:
        """Ask the Hub API to restart the Space."""
        self.api.restart_space(self.space_id)
|
26 |
+
|
27 |
+
|
28 |
+
# Each path segment stops at "/", any whitespace, and the punctuation that
# commonly follows a URL in prose: ")", "}" and ",". The dot in "github.com"
# is escaped so it only matches a literal dot.
_GITHUB_LINK_PATTERN = re.compile(
    r'https://github\.com/[^/\s)},]+/[^/\s)},]+'
    r'(?:/(?:tree|blob)/[^/\s)},]+/[^/\s)},]+)?')


def find_github_links(summary: str) -> str:
    """Return the single GitHub link mentioned in ``summary``.

    Matches ``https://github.com/<owner>/<repo>`` URLs, optionally followed
    by ``/tree/<ref>/<path>`` or ``/blob/<ref>/<path>``. Trailing dots
    (sentence punctuation) are removed, and repeated mentions of the same
    link count as one link.

    Args:
        summary: Free text to search.

    Returns:
        The link, or an empty string when the text contains none.

    Raises:
        RuntimeError: If the text contains more than one distinct link.
    """
    normalized = (match.rstrip('.')
                  for match in _GITHUB_LINK_PATTERN.findall(summary))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # error message below lists links in the order they appear.
    links = list(dict.fromkeys(normalized))
    if not links:
        return ''
    if len(links) > 1:
        raise RuntimeError(f'Found multiple GitHub links: {links}')
    return links[0]
|
41 |
+
|
42 |
+
|
43 |
+
class RepoUpdater:
    """Maintain ``papers.csv`` in a local clone of a Hub repository.

    The repository is cloned into the system temp directory. ``update``
    appends papers from the daily-papers API that are not in the CSV yet,
    and ``push`` sends the clone back to the Hub.
    """

    DAILY_PAPERS_URL = 'https://huggingface.co/api/daily_papers'
    # Seconds to wait for the API, so a stalled request cannot block the
    # calling thread forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, repo_id: str, repo_type: str):
        """Clone ``repo_id`` and pull the latest revision.

        Args:
            repo_id: Hub repo ID; its last path component names the local
                clone directory.
            repo_type: Repo type forwarded to ``Repository``.
        """
        api = HfApi()
        name = api.whoami()['name']

        # tempfile.tempdir stays None until gettempdir() has been called,
        # so ask for the directory through the function.
        self.repo_dir = pathlib.Path(
            tempfile.gettempdir()) / repo_id.split('/')[-1]
        self.repo = Repository(
            local_dir=self.repo_dir,
            clone_from=repo_id,
            repo_type=repo_type,
            git_user=name,
            git_email=f'{name}@users.noreply.huggingface.co')
        self.repo.git_pull()

    def _fetch_daily_papers(self, date: str) -> list:
        """Return the daily-papers API entries for ``date`` (YYYY-MM-DD).

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.get(self.DAILY_PAPERS_URL,
                                params={'date': date},
                                timeout=self.REQUEST_TIMEOUT)
        # Fail here on an error response instead of treating the error
        # payload as a list of papers.
        response.raise_for_status()
        return response.json()

    def update(self) -> None:
        """Append yesterday's and today's new papers to ``papers.csv``.

        Papers whose arXiv ID is already in the CSV are skipped. Papers
        whose summary contains several distinct GitHub links are reported
        with ``print`` and skipped. The CSV is rewritten in place; nothing
        is committed or pushed here.
        """
        # Read the clock once so both dates come from the same instant.
        # UTC matches the timezone the cron job is scheduled in.
        now = datetime.datetime.now(datetime.timezone.utc)
        yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        today = now.strftime('%Y-%m-%d')
        daily_papers = (self._fetch_daily_papers(yesterday) +
                        self._fetch_daily_papers(today))

        self.repo.git_pull()
        csv_path = self.repo_dir / 'papers.csv'
        df = pd.read_csv(csv_path, dtype=str).fillna('')
        seen_ids = set(df['arxiv_id'])

        new_rows = []
        for paper in daily_papers:
            arxiv_id = paper['paper']['id']
            if arxiv_id in seen_ids:
                continue
            # Record the ID right away so a paper returned for both dates
            # is handled only once.
            seen_ids.add(arxiv_id)
            try:
                github = find_github_links(paper['paper']['summary'])
            except RuntimeError as e:
                print(e)
                continue
            new_rows.append({'arxiv_id': arxiv_id, 'github': github})

        if new_rows:
            # concat keeps the CSV's existing columns even when it has no
            # rows; columns missing from the new rows are written empty.
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(csv_path, index=False)

    def push(self) -> None:
        """Push the local clone to the Hub via ``Repository.push_to_hub``."""
        self.repo.push_to_hub()
|
91 |
+
|
92 |
+
|
93 |
+
class UpdateScheduler:
    """Run the paper-list update as a background cron job."""

    def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
        """Build the helpers and register the cron job (without starting it).

        Args:
            space_id: Repo ID of the Space; used both as the Space to
                restart and as the repository to update.
            cron_hour: Cron ``hour`` field for the job.
            cron_minute: Cron ``minute`` field for the job.
        """
        self.space_restarter = SpaceRestarter(space_id=space_id)
        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')

        # The job fires at second 0 of the configured hour/minute, in UTC.
        cron_fields = {
            'hour': cron_hour,
            'minute': cron_minute,
            'second': 0,
            'timezone': 'UTC',
        }
        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(func=self._update,
                               trigger='cron',
                               **cron_fields)

    def _update(self) -> None:
        """Refresh the CSV, then push the changes or restart the Space."""
        updater = self.repo_updater
        updater.update()
        if not updater.repo.is_repo_clean():
            # The working tree changed: publish it.
            # NOTE(review): presumably the push itself makes the Space
            # rebuild, which is why no restart is requested here - confirm.
            updater.push()
            return
        # Nothing changed in the clone, so restart the Space explicitly.
        self.space_restarter.restart()

    def start(self) -> None:
        """Start the background scheduler."""
        self.scheduler.start()
|