|
import dataclasses |
|
import datetime |
|
import operator |
|
import pathlib |
|
|
|
import pandas as pd |
|
import requests |
|
import tqdm.auto |
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record describing a single paper.

    ``published_at`` is normalized on construction: a value in the
    ``%Y-%m-%dT%H:%M:%S.%fZ`` layout is rewritten as ``%Y/%m/%d %H:%M:%S``;
    any other string is stored unchanged.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        # The dataclass is frozen, so ordinary attribute assignment would
        # raise; object.__setattr__ is the standard way around that here.
        normalized = PaperInfo.convert_timestamp(self.published_at)
        object.__setattr__(self, "published_at", normalized)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat *timestamp* for display, or return it untouched.

        Args:
            timestamp: A string expected to look like
                ``2023-05-04T17:59:58.000Z``.

        Returns:
            The timestamp rendered as ``YYYY/MM/DD HH:MM:SS``, or the input
            itself when it does not match the expected layout.
        """
        source_layout = "%Y-%m-%dT%H:%M:%S.%fZ"
        display_layout = "%Y/%m/%d %H:%M:%S"
        try:
            parsed = datetime.datetime.strptime(timestamp, source_layout)
            return parsed.strftime(display_layout)
        except ValueError:
            # Best effort: an unrecognized layout is passed through as-is.
            return timestamp
|
|
|
|
|
def get_df(path: pathlib.Path | str, *, timeout: float = 30.0) -> pd.DataFrame:
    """Load a CSV of papers and enrich each row via the Hugging Face papers API.

    Every CSV column is read as a string (missing cells become ``""``) and is
    forwarded as a keyword argument to ``PaperInfo``. For each row the
    endpoint ``https://huggingface.co/api/papers/<arxiv_id>`` is queried and
    its ``title``, ``upvotes`` and ``publishedAt`` values are added.

    Args:
        path: Location of the CSV file. It must provide an ``arxiv_id``
            column; all columns are passed on to ``PaperInfo``.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a single stalled connection would block the
            whole run indefinitely.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.

    Raises:
        requests.exceptions.Timeout: If a request exceeds *timeout*.
        KeyError: If an API response lacks ``title``, ``upvotes`` or
            ``publishedAt``.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    records = []
    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
        arxiv_id = row["arxiv_id"]
        res = requests.get(
            f"https://huggingface.co/api/papers/{arxiv_id}",
            timeout=timeout,
        ).json()
        info = PaperInfo(
            **row,
            title=res["title"],
            paper_page=f"https://huggingface.co/papers/{arxiv_id}",
            upvotes=res["upvotes"],
            published_at=res["publishedAt"],
        )
        records.append(dataclasses.asdict(info))
    # Pin the columns so an empty CSV still yields the expected schema
    # instead of a column-less frame.
    field_names = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame(records, columns=field_names)
|
|
|
|
|
class Prettifier: |
|
@staticmethod |
|
def get_github_link(link: str) -> str: |
|
if not link: |
|
return "" |
|
return Prettifier.create_link("github", link) |
|
|
|
@staticmethod |
|
def create_link(text: str, url: str) -> str: |
|
return f'<a href="{url}" target="_blank">{text}</a>' |
|
|
|
@staticmethod |
|
def to_div(text: str | None, category_name: str) -> str: |
|
if text is None: |
|
text = "" |
|
class_name = f"{category_name}-{text.lower()}" |
|
return f'<div class="{class_name}">{text}</div>' |
|
|
|
def __call__(self, df: pd.DataFrame) -> pd.DataFrame: |
|
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True) |
|
new_rows = [] |
|
for _, row in df.iterrows(): |
|
new_row = dict(row) | { |
|
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"), |
|
"paper_page": f'<a href="%s" target="_blank">%s</a>' % ("https://arxiv.org/abs/"+row.arxiv_id,row.arxiv_id), |
|
"github": self.get_github_link(row.github), |
|
} |
|
new_rows.append(new_row) |
|
return pd.DataFrame(new_rows, columns=df.columns) |
|
|
|
|
|
class PaperList:
    """Hold the raw paper table together with its prettified, display-ready view."""

    # Each entry is [column name, datatype label]; the order here is the
    # column order of ``df_prettified``.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store *df* and build the prettified frame restricted to the listed columns."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names, in the order given by ``COLUMN_INFO``."""
        return [entry[0] for entry in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels, in the order given by ``COLUMN_INFO``."""
        return [entry[1] for entry in self.COLUMN_INFO]
|
|