papers

Runtime error

File size: 3,028 Bytes

08080f2

import dataclasses
import datetime
import html
import operator
import pathlib

import pandas as pd
import requests
import tqdm.auto


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata kept for a single paper.

    On construction ``published_at`` is passed through
    :meth:`convert_timestamp`, so the stored value is the reformatted
    timestamp whenever the input matches the expected source format.
    """

    arxiv_id: str
    published_at: str
    github: str
    title: str
    paper_page: str
    upvotes: int

    def __post_init__(self):
        normalized = PaperInfo.convert_timestamp(self.published_at)
        # The dataclass is frozen, so regular attribute assignment raises;
        # object.__setattr__ is the way to set a field during init.
        object.__setattr__(self, 'published_at', normalized)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` as ``YYYY/MM/DD HH:MM:SS``.

        A string that does not match the source format is returned
        unchanged rather than raising.
        """
        source_format = '%Y-%m-%dT%H:%M:%S.%fZ'
        target_format = '%Y/%m/%d %H:%M:%S'
        try:
            parsed = datetime.datetime.strptime(timestamp, source_format)
            return parsed.strftime(target_format)
        except ValueError:
            return timestamp


def get_df(path: pathlib.Path | str, timeout: float = 30.0) -> pd.DataFrame:
    """Load a CSV of papers and enrich each row via the Hugging Face API.

    The CSV is read with every column as a string and missing cells
    replaced by ``''``. For each row, the paper's JSON record is fetched
    from ``https://huggingface.co/api/papers/<arxiv_id>`` and its
    ``title`` and ``upvotes`` are combined with the row's own columns
    into a ``PaperInfo``.

    Args:
        path: Location of the CSV file. It must contain an ``arxiv_id``
            column; all columns are forwarded to ``PaperInfo`` as keyword
            arguments.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a single stalled connection would block
            the whole load indefinitely.

    Returns:
        A DataFrame with one row per paper, built from the ``PaperInfo``
        fields.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        KeyError: If a response lacks ``title`` or ``upvotes``.
    """
    df = pd.read_csv(path, dtype=str).fillna('')
    paper_info = []
    # A single session lets consecutive requests to the same host reuse
    # the underlying connection instead of reconnecting for every paper.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            res = session.get(
                f'https://huggingface.co/api/papers/{row.arxiv_id}',
                timeout=timeout).json()
            info = PaperInfo(
                **row,
                title=res['title'],
                paper_page=f'https://huggingface.co/papers/{row.arxiv_id}',
                upvotes=res['upvotes'])
            paper_info.append(info)
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])


class Prettifier:
    @staticmethod
    def get_paper_page_link(link: str) -> str:
        return Prettifier.create_link(link.split('/')[-1], link)

    @staticmethod
    def get_github_link(link: str) -> str:
        if not link:
            return ''
        return Prettifier.create_link('github', link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        return f'<a href={url} target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        if text is None:
            text = ''
        class_name = f'{category_name}-{text.lower()}'
        return f'<div class="{class_name}">{text}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.sort_values('arxiv_id', ascending=False).reset_index(drop=True)
        new_rows = []
        for _, row in df.iterrows():
            new_row = dict(row) | {
                'paper_page': self.get_paper_page_link(row.paper_page),
                'github': self.get_github_link(row.github),
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows, columns=df.columns)


class PaperList:
    """Hold a paper table in both raw and display-ready form.

    ``COLUMN_INFO`` lists the displayed columns in order, each entry
    being ``[column name, datatype label]``.
    """

    COLUMN_INFO = [
        ['paper_page', 'markdown'],
        ['title', 'str'],
        ['github', 'markdown'],
        ['upvotes', 'number'],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and build the prettified, column-restricted view."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        # Keep only the columns declared in COLUMN_INFO, in that order.
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels of the displayed columns, in ``COLUMN_INFO`` order."""
        return [datatype for _, datatype in self.COLUMN_INFO]