papers

Runtime error

App Files Files Community

papers / papers.py

pxiaoer

fix paper

07aaba9 about 1 year ago

raw

history blame

3.09 kB

	import dataclasses
	import datetime
	import operator
	import pathlib

	import pandas as pd
	import requests
	import tqdm.auto


	@dataclasses.dataclass(frozen=True)
	class PaperInfo:
	    """Immutable record describing one paper.

	    ``published_at`` is normalised on construction: a recognised ISO-8601
	    UTC timestamp is rewritten as ``YYYY/MM/DD HH:MM:SS``; anything else is
	    stored unchanged.
	    """

	    date: str
	    arxiv_id: str
	    github: str
	    title: str
	    paper_page: str
	    upvotes: int
	    published_at: str

	    def __post_init__(self):
	        # The dataclass is frozen, so normal attribute assignment raises;
	        # object.__setattr__ is the sanctioned way to set a field here.
	        object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))

	    @staticmethod
	    def convert_timestamp(timestamp: str) -> str:
	        """Reformat an ISO-8601 ``...Z`` timestamp as ``YYYY/MM/DD HH:MM:SS``.

	        Both ``2023-10-05T17:59:58.000Z`` (fractional seconds) and
	        ``2023-10-05T17:59:58Z`` (whole seconds) are accepted.

	        Args:
	            timestamp: The timestamp string to convert.

	        Returns:
	            The reformatted timestamp, or ``timestamp`` unchanged when it
	            matches neither accepted layout.
	        """
	        # Try the fractional-second layout first, then the whole-second one,
	        # so every valid value ends up in the same output format.
	        for layout in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
	            try:
	                parsed = datetime.datetime.strptime(timestamp, layout)
	            except ValueError:
	                continue
	            return parsed.strftime("%Y/%m/%d %H:%M:%S")
	        return timestamp


	def get_df(path: pathlib.Path | str, *, timeout: float = 30.0) -> pd.DataFrame:
	    """Load a CSV of papers and enrich each row from the Hugging Face papers API.

	    Every CSV column is read as a string (missing cells become ``""``) and
	    is forwarded as a keyword argument to ``PaperInfo``; the row's
	    ``arxiv_id`` is used to query
	    ``https://huggingface.co/api/papers/<arxiv_id>`` for the title, upvote
	    count and publication timestamp.

	    Args:
	        path: Location of the CSV file.
	        timeout: Seconds to wait for each API response before giving up.

	    Returns:
	        A DataFrame with one row per paper and one column per ``PaperInfo``
	        field. The columns are present even when the CSV has no rows.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	        requests.Timeout: If the API does not answer within ``timeout``.
	    """
	    df = pd.read_csv(path, dtype=str).fillna("")
	    paper_info = []
	    # A single session reuses the connection across the per-paper requests.
	    with requests.Session() as session:
	        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
	            response = session.get(
	                f"https://huggingface.co/api/papers/{row.arxiv_id}",
	                timeout=timeout,
	            )
	            # Fail loudly on HTTP errors instead of hitting a KeyError on
	            # an error payload further down.
	            response.raise_for_status()
	            res = response.json()
	            info = PaperInfo(
	                **row,
	                title=res["title"],
	                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
	                upvotes=res["upvotes"],
	                published_at=res["publishedAt"],
	            )
	            paper_info.append(info)
	    # Naming the columns explicitly keeps the schema stable for an empty CSV.
	    columns = [field.name for field in dataclasses.fields(PaperInfo)]
	    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)


	class Prettifier:
	@staticmethod
	def get_github_link(link: str) -> str:
	if not link:
	return ""
	return Prettifier.create_link("github", link)

	@staticmethod
	def create_link(text: str, url: str) -> str:
	return f'<a href="{url}" target="_blank">{text}</a>'

	@staticmethod
	def to_div(text: str \| None, category_name: str) -> str:
	if text is None:
	text = ""
	class_name = f"{category_name}-{text.lower()}"
	return f'<div class="{class_name}">{text}</div>'

	def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
	df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
	new_rows = []
	for _, row in df.iterrows():
	new_row = dict(row) \| {
	"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
	"paper_page": f'<a href="%s" target="_blank">%s</a>' % ("https://arxiv.org/abs/"+row.arxiv_id,row.arxiv_id),
	"github": self.get_github_link(row.github),
	}
	new_rows.append(new_row)
	return pd.DataFrame(new_rows, columns=df.columns)


	class PaperList:
	    """Hold a paper DataFrame together with its prettified, display-ready view."""

	    # Each entry is [column name, datatype label], listed in display order.
	    # NOTE(review): the labels look like UI table datatypes - confirm with the caller.
	    COLUMN_INFO = [
	        ["date", "markdown"],
	        ["paper_page", "markdown"],
	        ["title", "str"],
	        ["github", "markdown"],
	        ["upvotes", "number"],
	    ]

	    def __init__(self, df: pd.DataFrame):
	        """Store ``df`` and build the prettified frame limited to the display columns."""
	        self.df_raw = df
	        self._prettifier = Prettifier()
	        prettified = self._prettifier(df)
	        # Keep only the columns named in COLUMN_INFO, in that order.
	        self.df_prettified = prettified.loc[:, self.column_names]

	    @property
	    def column_names(self):
	        """Names of the display columns, in COLUMN_INFO order."""
	        return [entry[0] for entry in self.COLUMN_INFO]

	    @property
	    def column_datatype(self):
	        """Datatype labels of the display columns, in COLUMN_INFO order."""
	        return [entry[1] for entry in self.COLUMN_INFO]