File size: 3,086 Bytes
08080f2 e2797b8 08080f2 857ce49 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 e2797b8 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 2b2c6a9 9fb4b90 08080f2 c0db5be 6414061 c0db5be 9fb4b90 6414061 08080f2 07aaba9 08080f2 07aaba9 08080f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import dataclasses
import datetime
import operator
import pathlib
import pandas as pd
import requests
import tqdm.auto
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record describing a single paper.

    ``published_at`` is normalised once, right after construction: a value
    shaped like ``2023-10-05T17:59:59.000Z`` is stored as
    ``2023/10/05 17:59:59``; any other string is stored unchanged.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    # Rewritten by __post_init__ via convert_timestamp().
    published_at: str

    def __post_init__(self):
        """Replace ``published_at`` with its normalised form."""
        normalised = PaperInfo.convert_timestamp(self.published_at)
        # The dataclass is frozen, so ordinary attribute assignment would raise;
        # object.__setattr__ bypasses the frozen guard.
        object.__setattr__(self, "published_at", normalised)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat a ``%Y-%m-%dT%H:%M:%S.%fZ`` timestamp as ``%Y/%m/%d %H:%M:%S``.

        Args:
            timestamp: The timestamp string to convert.

        Returns:
            The reformatted timestamp, or ``timestamp`` itself when it does not
            match the expected input format.
        """
        source_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        target_format = "%Y/%m/%d %H:%M:%S"
        try:
            moment = datetime.datetime.strptime(timestamp, source_format)
            formatted = moment.strftime(target_format)
        except ValueError:
            # Not in the expected format: hand the input back untouched.
            formatted = timestamp
        return formatted
def get_df(path: pathlib.Path | str, *, timeout: float = 30.0) -> pd.DataFrame:
    """Build a table of paper metadata from a CSV of papers.

    The CSV at ``path`` is read with every column as a string and missing
    cells replaced by ``""``. For each row, the paper's JSON record is fetched
    from ``https://huggingface.co/api/papers/<arxiv_id>`` and its ``title``,
    ``upvotes`` and ``publishedAt`` values are combined with the row into a
    ``PaperInfo``.

    Args:
        path: Location of the CSV file. Every CSV column is forwarded to
            ``PaperInfo`` as a keyword argument, so the columns must match the
            ``PaperInfo`` fields that are not filled in from the API response.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per CSV row and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
        KeyError: If a response lacks ``title``, ``upvotes`` or ``publishedAt``.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    records = []
    # One session for the whole run so the connection to the API host is reused.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            arxiv_id = row["arxiv_id"]
            response = session.get(f"https://huggingface.co/api/papers/{arxiv_id}", timeout=timeout)
            # Fail with a clear HTTP error instead of a KeyError on the error payload.
            response.raise_for_status()
            res = response.json()
            info = PaperInfo(
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            records.append(dataclasses.asdict(info))
    # Explicit columns keep the schema stable for an empty CSV.
    column_names = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame(records, columns=column_names)
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
new_rows = []
for _, row in df.iterrows():
new_row = dict(row) | {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": f'<a href="%s" target="_blank">%s</a>' % ("https://arxiv.org/abs/"+row.arxiv_id,row.arxiv_id),
"github": self.get_github_link(row.github),
}
new_rows.append(new_row)
return pd.DataFrame(new_rows, columns=df.columns)
class PaperList:
    """Hold a paper table together with its prettified, display-ready form."""

    # Each entry pairs a column name with a datatype label for that column.
    # NOTE(review): the labels are presumably consumed by a UI table component;
    # confirm against the caller.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and a prettified view limited to the ``COLUMN_INFO`` columns."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        # Keep only the listed columns, in the listed order.
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names, in ``COLUMN_INFO`` order."""
        return [entry[0] for entry in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels, in ``COLUMN_INFO`` order."""
        return [entry[1] for entry in self.COLUMN_INFO]
|