papers

Runtime error

File size: 4,397 Bytes

b4eb3ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1e8757
 
 
b4eb3ca
 
8cbdd6f
b4eb3ca
8cbdd6f
b4eb3ca
8cbdd6f
b4eb3ca
 
 
 
 
 
 
 
 
 
e2797b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4eb3ca
 
8cbdd6f
b4eb3ca
 
 
e2797b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4eb3ca
8cbdd6f
b4eb3ca

import datetime
import pathlib
import re
import tempfile

import pandas as pd
import requests
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, Repository
from huggingface_hub.utils import RepositoryNotFoundError


class SpaceRestarter:
    """Restart a Hugging Face Space through ``HfApi``."""

    def __init__(self, space_id: str):
        """Validate the token and the Space, then remember the Space ID.

        Args:
            space_id: ID of the Space to restart.

        Raises:
            ValueError: If the token permission is not ``'write'`` or the
                Space lookup raises ``RepositoryNotFoundError``.
        """
        self.api = HfApi()

        permission = self.api.get_token_permission()
        if permission != 'write':
            raise ValueError('The HF token must have write permission.')

        # Look the Space up once so a bad ID fails at construction time
        # rather than on the first restart.
        try:
            self.api.space_info(repo_id=space_id)
        except RepositoryNotFoundError:
            raise ValueError('The Space ID does not exist.')
        else:
            self.space_id = space_id

    def restart(self) -> None:
        """Ask the Hub to restart the configured Space."""
        target = self.space_id
        self.api.restart_space(target)


def find_github_links(summary: str) -> str:
    """Extract the single GitHub repository link mentioned in ``summary``.

    A link is ``https://github.com/<owner>/<repo>``, optionally followed by
    ``/tree/<ref>/<name>`` or ``/blob/<ref>/<name>``. One trailing ``.``
    (sentence punctuation) is removed, and repeated mentions of the same
    link count as one.

    Args:
        summary: Free text to scan.

    Returns:
        The link, or ``''`` when the text contains none.

    Raises:
        RuntimeError: If more than one distinct link is found.
    """
    # Path segments exclude all whitespace (not only the space character) so
    # a link followed by a line break does not swallow the next word.
    matches = re.findall(
        r'https://github\.com/[^/\s]+/[^/\s)},]+'
        r'(?:/(?:tree|blob)/[^/\s]+/[^/\s)},]+)?',
        summary)
    if not matches:
        return ''

    # Drop a sentence-ending dot first, then de-duplicate while keeping the
    # order of first appearance, so the same URL cited twice is not an error.
    links = list(
        dict.fromkeys(m[:-1] if m.endswith('.') else m for m in matches))
    if len(links) != 1:
        raise RuntimeError(f'Found multiple GitHub links: {links}')
    return links[0]


class RepoUpdater:
    """Maintain ``papers.csv`` inside a local clone of a Hub repository.

    The clone lives in the system temporary directory. ``update`` appends
    newly listed daily papers to the CSV; ``push`` uploads the result.
    """

    # Seconds to wait on the daily-papers API so that a stalled connection
    # cannot block the caller indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self, repo_id: str, repo_type: str):
        """Set up the local ``Repository`` clone and pull it.

        Args:
            repo_id: Hub repo ID; its last ``/``-separated component names
                the local directory.
            repo_type: Repo type forwarded to ``Repository``.

        Raises:
            ValueError: If the token permission is not ``'write'``.
        """
        api = HfApi()
        if api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')

        name = api.whoami()['name']

        repo_dir = self._local_repo_dir(repo_id)
        self.csv_path = repo_dir / 'papers.csv'
        self.repo = Repository(
            local_dir=repo_dir,
            clone_from=repo_id,
            repo_type=repo_type,
            git_user=name,
            git_email=f'{name}@users.noreply.huggingface.co')
        self.repo.git_pull()

    @staticmethod
    def _local_repo_dir(repo_id: str) -> pathlib.Path:
        """Return the temp-dir path used for the local clone of ``repo_id``."""
        # tempfile.tempdir stays None until gettempdir() has been called, so
        # reading the attribute directly can hand None to pathlib.Path.
        return pathlib.Path(tempfile.gettempdir()) / repo_id.split('/')[-1]

    @classmethod
    def _fetch_daily_papers(cls, date: str) -> list:
        """Return the daily-papers API payload for ``date`` (``YYYY-MM-DD``)."""
        response = requests.get(
            f'https://huggingface.co/api/daily_papers?date={date}',
            timeout=cls._REQUEST_TIMEOUT)
        # Fail on an HTTP error here instead of later, when an error payload
        # would be iterated as if it were a list of papers.
        response.raise_for_status()
        return response.json()

    def update(self) -> None:
        """Append yesterday's and today's unseen daily papers to the CSV.

        Papers whose ``arxiv_id`` is already recorded are skipped, as are
        papers for which ``find_github_links`` raises ``RuntimeError`` (the
        error is printed). The CSV is rewritten on every call.
        """
        # Take one clock reading so both dates come from the same instant.
        now = datetime.datetime.now()
        dates = [
            (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d'),
            now.strftime('%Y-%m-%d'),
        ]
        daily_papers = [(date, self._fetch_daily_papers(date))
                        for date in dates]

        self.repo.git_pull()
        df = pd.read_csv(self.csv_path, dtype=str).fillna('')
        arxiv_ids = set(df['arxiv_id'])

        new_rows = []
        for date, papers in daily_papers:
            for paper in papers:
                arxiv_id = paper['paper']['id']
                if arxiv_id in arxiv_ids:
                    continue
                try:
                    github = find_github_links(paper['paper']['summary'])
                except RuntimeError as e:
                    print(e)
                    continue
                # Record the ID immediately so that a paper appearing in
                # more than one fetched list is only added once.
                arxiv_ids.add(arxiv_id)
                new_rows.append({
                    'date': date,
                    'arxiv_id': arxiv_id,
                    'github': github,
                })

        # Appending to the loaded frame keeps its columns (and hence the CSV
        # header) even when the file had no rows and nothing new was found.
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(self.csv_path, index=False)

    def push(self) -> None:
        """Push the local clone to the Hub via ``Repository.push_to_hub``."""
        self.repo.push_to_hub()


class UpdateScheduler:
    """Run the paper update on a cron schedule in a background scheduler."""

    def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
        """Build the restarter, the updater and the scheduled job.

        Args:
            space_id: Space ID, used both for restarting and as the repo to
                update (with repo type ``'space'``).
            cron_hour: ``hour`` field of the cron trigger.
            cron_minute: ``minute`` field of the cron trigger.
        """
        self.space_restarter = SpaceRestarter(space_id=space_id)
        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')

        # The job fires at second 0 of the given hour/minute, in UTC.
        cron_fields = {
            'hour': cron_hour,
            'minute': cron_minute,
            'second': 0,
            'timezone': 'UTC',
        }
        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(func=self._update,
                               trigger='cron',
                               **cron_fields)

    def _update(self) -> None:
        """Refresh the CSV, then push if it changed or restart if it did not."""
        updater = self.repo_updater
        updater.update()

        has_changes = not updater.repo.is_repo_clean()
        if has_changes:
            updater.push()
            return
        self.space_restarter.restart()

    def start(self) -> None:
        """Start the background scheduler."""
        self.scheduler.start()