# Easy_Cite_Chip / ez_cite.py
import os
import re
import time
import json
import shutil
import requests
import spacy  # requires: python -m spacy download en_core_web_lg
from openai import OpenAI, APIError
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    load_index_from_storage,
)
from llama_index.embeddings import HuggingFaceEmbedding, TogetherEmbedding
from llama_index.storage.storage_context import StorageContext

TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
SEMANTIC_SCHOLAR_API_KEY = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
# Strip citations from the raw .tex text, or accept an introduction that already has no citations directly from the user.
def remove_citation(text):
# Regular expression to match \cite{...}
pattern = r'\\cite\{[^}]*\}'
# Replace \cite{...} with an empty string
text = re.sub(pattern, '', text)
# Replace multiple spaces with a single space
text = re.sub(r' +', ' ', text)
# Replace spaces before punctuation marks with just the punctuation marks
text = re.sub(r"\s+([,.!?;:()\[\]{}])", r"\1", text)
return text
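# Illustrative example (assumed input, not part of the pipeline):
#   remove_citation(r"prone to overfitting noise \cite{geman1992neural}.")
# returns "prone to overfitting noise." -- the \cite{...} command is dropped and the
# stray space left in front of the period is cleaned up.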
def get_chat_completion(client, prompt, llm_model, max_tokens):
messages = [
{
"role": "system",
"content": "You are an AI assistant",
},
{
"role": "user",
"content": prompt,
}
]
try:
chat_completion = client.chat.completions.create(
messages=messages,
model=llm_model,
max_tokens=max_tokens
)
return chat_completion.choices[0].message.content
    except APIError as e:
        # Handle API-specific errors; fall back to an empty response so callers can continue
        print(f"API Error: {e}")
        return ""
    except Exception as e:
        # Handle any other error the same way
        print(f"Error: {e}")
        return ""
def get_relevant_papers(search_query, sort=True, count=10):
"""
search_query (str): the required query parameter and its value (in this case, the keyword we want to search for)
count (int): the number of relevant papers to return for each query
Semantic Scholar Rate limit:
1 request per second for the following endpoints:
/paper/batch
/paper/search
/recommendations
10 requests / second for all other calls
"""
# Define the paper search endpoint URL; All keywords in the search query are matched against the paper’s title and abstract.
url = 'https://api.semanticscholar.org/graph/v1/paper/search'
# Define headers with API key
headers = {'x-api-key': SEMANTIC_SCHOLAR_API_KEY}
query_params = {
'query': search_query,
'fields': 'url,title,year,abstract,authors.name,journal,citationStyles,tldr,referenceCount,citationCount',
'limit': 20,
}
# Send the API request
response = requests.get(url, params=query_params, headers=headers)
# Check response status
if response.status_code == 200:
json_response = response.json()
if json_response['total'] != 0:
papers = json_response['data']
else:
papers = []
# Sort the papers based on citationCount in descending order
if sort:
papers = sorted(papers, key=lambda x: x['citationCount'], reverse=True)
return papers[:count]
    else:
        print(f"Request failed with status code {response.status_code}: {response.text}")
        return []
def save_papers(unique_dir, papers):
os.makedirs(unique_dir, exist_ok=True)
# Save each dictionary to a separate JSON file
for i, dictionary in enumerate(papers):
filename = os.path.join(unique_dir, f"{dictionary['paperId']}.json")
with open(filename, 'w') as json_file:
json.dump(dictionary, json_file, indent=4)
print(f"{len(papers)} papers saved as JSON files successfully at {unique_dir}.")
def get_index(service_context, docs_dir, persist_dir):
documents = SimpleDirectoryReader(docs_dir, filename_as_id=True).load_data()
# check if storage already exists
PERSIST_DIR = persist_dir
if not os.path.exists(PERSIST_DIR):
print('create new index')
index = VectorStoreIndex.from_documents(
documents, service_context=service_context, show_progress=False
)
# store it for later
index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
print('load the existing index')
# load the existing index
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context, service_context=service_context)
# refresh the index
refreshed_docs = index.refresh_ref_docs(documents, update_kwargs={"delete_kwargs": {"delete_from_docstore": True}})
print(f'refreshed_docs:\n{refreshed_docs}')
return index
def get_paper_data(text):
"""text = node.text """
dictionary_from_json = json.loads(text)
bibtex = dictionary_from_json['citationStyles']['bibtex']
bibtex = bibtex.replace('&', 'and')
citation_label = re.findall(r'@(\w+){([\w-]+)', bibtex)[0][1]
citationCount = dictionary_from_json['citationCount']
if dictionary_from_json['tldr'] is not None:
tldr = dictionary_from_json['tldr']['text']
else:
tldr = 'No tldr available'
url = dictionary_from_json['url']
return citation_label, (bibtex, citationCount, tldr, url)
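# Illustrative example of the citation-label extraction above (assumed bibtex snippet):
#   re.findall(r'@(\w+){([\w-]+)', '@Article{bartlett2020benign, ...}')
# returns [('Article', 'bartlett2020benign')], so [0][1] yields the label 'bartlett2020benign'.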
def move_cite_inside_sentence(sent, ez_citation):
    """Insert the citation just before the sentence's final punctuation mark."""
    if sent[-1] != '\n':
        character = sent[-1]
        sent_new = sent[:-1] + ' <ez_citation>' + character
    else:
        # The sentence ends with newline(s): place the citation before the last
        # non-newline character and keep the trailing newlines intact.
        count = sent.count('\n')
        character = sent[-(count + 1)]
        sent_new = sent[:-(count + 1)] + ' <ez_citation>' + character + '\n' * count
    return sent_new.replace('<ez_citation>', ez_citation)
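# Illustrative example (assumed sentence and label):
#   move_cite_inside_sentence("Models can overfit noise.", r"\cite{geman1992neural}")
# returns "Models can overfit noise \cite{geman1992neural}."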
def write_bib_file(bib_file_content, data):
bibtex, citationCount, tldr, url = data
bib_file_content = bib_file_content + f'\n%citationCount: {citationCount}\n%tldr: {tldr}\n%url: {url}\n' + bibtex
return bib_file_content
def write_citation(sent, bib_file_content, retrieved_nodes, sim_threshold=0.75):
labels = []
for node in retrieved_nodes:
citation_label, data = get_paper_data(node.text)
print('relevant paper id (node.id_):', node.id_, 'match score (node.score):', node.score)
print('relevant paper data:', *data)
print('-'*30)
if node.score > sim_threshold and citation_label != "None":
labels.append(citation_label)
if not (citation_label in bib_file_content):
bib_file_content = write_bib_file(bib_file_content, data)
else:
continue
labels = ', '.join(labels)
if labels:
        ez_citation = f'\\cite{{{labels}}}'
sent_new = move_cite_inside_sentence(sent, ez_citation)
else:
sent_new = sent
return sent_new, bib_file_content
get_prompt = lambda sentence: f"""
I want to use the Semantic Scholar paper search API to find relevant papers. Could you read the following text and then suggest a suitable search query for this task?
Here is an example for using the api:
<example>
```python
import requests
# Define the paper search endpoint URL
url = 'https://api.semanticscholar.org/graph/v1/paper/search'
# Define the required query parameter and its value (in this case, the keyword we want to search for)
query_params = {{
'query': 'semantic scholar platform',
'limit': 3
}}
# Make the GET request with the URL and query parameters
searchResponse = requests.get(url, params=query_params)
```
</example>
Here is the text:
<text>
{sentence}
</text>
"""
# main block
def main(sentences, count, client, llm_model, max_tokens, service_context):
"""count (int): the number of relevant papers to return for each query"""
sentences_new = []
bib_file_content = ''
for sentence in sentences:
prompt = get_prompt(sentence)
response = get_chat_completion(client, prompt, llm_model, max_tokens)
# Define a regular expression pattern to find the value of 'query'
pattern = r"'query': '(.*?)'"
matches = re.findall(pattern, response)
if matches:
search_query = matches[0]
else:
            search_query = ' '.join(sentence.split()[:2])  # fall back to the first two words of the sentence
relevant_papers = get_relevant_papers(search_query, sort=True, count=count)
if relevant_papers:
# save papers to json files and build index
unique_dir = os.path.join("papers", f"{int(time.time())}")
persist_dir = os.path.join("index", f"{int(time.time())}")
save_papers(unique_dir, relevant_papers)
index = get_index(service_context, unique_dir, persist_dir)
# get sentence's most similar papers
retriever = index.as_retriever(service_context=service_context, similarity_top_k=5)
retrieved_nodes = retriever.retrieve(sentence)
sent_new, bib_file_content = write_citation(sentence, bib_file_content, retrieved_nodes, sim_threshold=0.7)
sentences_new.append(sent_new)
else:
sentences_new.append(sentence)
print('sentence:', sentence.strip())
print('search_query:', search_query)
print('='*30)
return sentences_new, bib_file_content
def ez_cite(introduction, debug=False):
nlp = spacy.load("en_core_web_lg")
doc = nlp(introduction)
sentences = [sentence.text for sentence in doc.sents]
    sentences = [remove_citation(sentence) for sentence in sentences]
    client = OpenAI(
        api_key=TOGETHER_API_KEY,
        base_url='https://api.together.xyz',
    )
llm_model = "Qwen/Qwen1.5-72B-Chat"
max_tokens = 1000
embed_model = TogetherEmbedding(model_name="togethercomputer/m2-bert-80M-8k-retrieval", api_key=TOGETHER_API_KEY)
    # chunk_size must exceed the size of each paper's .json file so that every
    # paper is kept in a single node (one document -> one node).
    service_context = ServiceContext.from_defaults(
        llm=None, embed_model=embed_model, chunk_size=8192,
    )
if debug:
sentences = sentences[:2]
sentences_new, bib_file_content = main(sentences, count=10,
client=client,
llm_model=llm_model,
max_tokens=max_tokens,
service_context=service_context)
with open('intro.bib', 'w') as bib_file:
bib_file.write(bib_file_content)
final_intro = ' '.join(sentences_new)
print(final_intro)
print('='*30)
dir_path = "index"
try:
# Delete the directory and its contents
shutil.rmtree(dir_path)
print(f"Directory '{dir_path}' deleted successfully.")
except Exception as e:
print(f"Error deleting directory '{dir_path}': {e}")
dir_path = "papers"
try:
# Delete the directory and its contents
shutil.rmtree(dir_path)
print(f"Directory '{dir_path}' deleted successfully.")
except Exception as e:
print(f"Error deleting directory '{dir_path}': {e}")
return final_intro, bib_file_content
# arXiv:2209.05523v2
introduction = r"""A long-standing paradigm in machine learning is the trade-off between the complexity of a model family and the model's ability to generalize: more expressive model classes contain better candidates to fit complex trends in data, but are also prone to overfitting noise \cite{nielsen2015neural, geman1992neural}. \textit{Interpolation}, defined for our purposes as choosing a model with zero training error, was hence long considered bad practice \cite{hastie2009elements}. The success of deep learning - machine learning in a specific regime of extremely complex model families with vast amounts of tunable parameters - seems to contradict this notion; here, consistent evidence shows that among some interpolating models, more complexity tends \textit{not to harm} the generalisation performance, a phenomenon described as "benign overfitting" \cite{bartlett2021deep}.
In recent years, a surge of theoretical studies have reproduced benign overfitting in simplified settings with the hope of isolating the essential ingredients of the phenomenon \cite{bartlett2021deep, belkin2021fit}. For example, Ref. \cite{bartlett2020benign} showed how interpolating linear models in a high complexity regime (more dimensions than datapoints) could generalize just as well as their lower-complexity counterparts on new data, and analyzed the properties of the data that lead to the "absorption" of noise by the interpolating model without harming the model's predictions. Ref. \cite{belkin2019reconciling} showed that there are model classes of simple functions that change quickly in the vicinity of the noisy training data, but recover a smooth trend elsewhere in data space (see Figure 1). Such functions have also been used to train nearest neighbor models that perfectly overfit training data while generalizing well, thereby directly linking "spiking models" to benign overfitting \cite{belkin2019does}. Recent works try to recover the basic mechanism of such spiking models using the language of Fourier analysis \cite{muthukumar2020harmless, muthukumar2021classification, dar2021farewell}.
In parallel to these exciting developments in the theory of deep learning, quantum computing researchers have proposed families of parametrised quantum algorithms as model classes for machine learning (e.g. Ref. \cite{benedetti2019parameterized}). These quantum models can be optimised similarly to neural networks \cite{mitarai2018quantum, schuld2019evaluating} and have interesting parallels to kernel methods \cite{schuld2019quantum, havlivcek2019supervised} and generative models \cite{lloyd2018quantum, dallaire2018quantum}. Although researchers have taken some first steps to study the expressivity \cite{abbas2021power, wright2020capacity, sim2019expressibility, hubregtsen2021evaluation}, trainability \cite{mcclean2018barren, cerezo2021cost} and generalisation \cite{caro2021encoding, huang_power_2021, caro2022generalization, banchi2021generalization} of quantum models, we still know relatively little about their behaviour. In particular, the interplay of overparametrisation, interpolation, and generalisation that seems so important for deep learning is yet largely unexplored.
In this paper we develop a simplified framework in which questions of overfitting in quantum machine learning can be investigated. Essentially, we exploit the observation that quantum models can often be described in terms of Fourier series where well-defined components of the quantum circuit influence the selection of Fourier modes and their respective Fourier coefficients \cite{gil2020input, schuld2021effect, wierichs2022general}. We link this description to the analysis of spiking models and benign overfitting by building on prior works analyzing these phenomena using Fourier methods. In this approach, the complexity of a model is related to the number of Fourier modes that its Fourier series representation consists of, and overparametrised model classes have more modes than needed to interpolate the training data (i.e., to have zero training error). After deriving the generalization error for such model classes these "superfluous" modes lead to spiking models, which have large oscillations around the training data while keeping a smooth trend everywhere else. However, large numbers of modes can also harm the recovery of an underlying signal, and we therefore balance this trade-off to produce an explicit example of benign overfitting in a quantum machine learning model.
The mathematical link described above allows us to probe the impact of important design choices for a simplified class of quantum models on this trade-off. For example, we find why a measure of redundancy in the spectrum of the Hamiltonian that defines standard data encoding strategies strongly influences this balance; in fact to an extent that is difficult to counterbalance by other design choices of the circuit.
The remainder of the paper proceeds as follows. We will first review the classical Fourier framework for the study of interpolating models and develop explicit formulae for the error in these models to produce a basic example of benign overfitting (Sec. 2). We will then construct a quantum model with analogous components to the classical model, and demonstrate how each of these components is related to the structure of the corresponding quantum circuit and measurement (Sec. 3). We then analyze specific cases that give rise to "spikiness" and benign overfitting in these quantum models (Sec. 3.2)."""
# # arXiv:2302.01365v3
# introduction = r"""In order for a learning model to generalise well from training data, it is often crucial to encode some knowledge about the structure of the data into the model itself. Convolutional neural networks are a classic illustration of this principle, whose success at image related tasks is often credited to the existence of model structures that relate to label invariance of the data under translation symmetries. Together with the choice of loss function and hyperparameters, these structures form part of the basic assumptions that a learning model makes about the data, which is commonly referred to as the _inductive bias_ of the model.
# One of the central challenges facing quantum machine learning is to identify data structures that can be encoded usefully into quantum learning models; in other words, what are the forms of inductive bias that naturally lend themselves to quantum computation? In answering this question, we should be wary of hoping for a one-size-fits-all approach in which quantum models outperform neural network models at generic learning tasks. Rather, effort should be placed in understanding how the Hilbert space structure and probabilistic nature of the theory suggest particular biases for which quantum machine learning may excel. Indeed, an analogous perspective is commonplace in quantum computation, where computational advantages are expected only for specific problems that happen to benefit from the peculiarities of quantum logic.
# In the absence of large quantum computers and in the infancy of quantum machine learning theory, how should we look for insight on this issue? One possibility is to turn to complexity theory, where asymptotic advantages of quantum learning algorithms have been proven. These results are few and far between however, and the enormous gap between what is possible to prove in a complexity-theoretic sense, and the types of advantages that may be possible in practice, means that there are growing doubts about the practical relevance of these results. Indeed, progress in machine learning is often the result of good ideas built on intuition, rather than worst-case complexity theoretic analysis. To repeat a common quip: many problems in machine learning are NP-hard, but neural networks don't know that so they solve them anyway.
# We will take a different route, and lean on the field of quantum foundations to guide us. Quantum foundations is predominantly concerned with understanding the frontier between the quantum and classical world, and typically values a clear qualitative understanding of a phenomenon over purely mathematical knowledge. For these reasons it is well suited to identify features of quantum theory that may advance quantum machine learning in useful directions. In particular, we focus on the phenomenon of contextuality, which is perhaps the most prominent form of nonclassicality studied in the literature. Contextuality has a considerable tradition of being studied in relation to quantum computation, where it is closely connected to the possibility of computational speed-up. Despite this, it has had relatively little attention in quantum machine learning, with only a couple of works linking contextuality to implications for learning.
# We adopt a notion of contextuality called \textit{generalised contextuality}, introduced by Spekkens in 2004. Loosely speaking, it refers to the fact that (i) there are different experimental procedures (called contexts) in the theory that are indistinguishable1, and (ii) any statistical model that reproduces the predictions of the theory must take these contexts into account. With this choice, our first task will then be to introduce a framework to talk about generalised contextuality in machine learning (Section 2). This was missing in previous works, which prove consequences for learning based on phenomena originating from contextuality, but do not attempt to define a notion of contextuality for machine learning that captures a wide range of models. Our general philosophy will be that the framework should depend purely on what a learning model can do, and not on the details of how it does it; i.e., the framework should be independent of the theory on which the models are built. This is necessary to have a framework that treats quantum and classical algorithms on the same footing, and ultimately involves adopting definitions in a similar spirit to the notion of operational contextuality as recently described in.
# We mostly focus on a paradigm of machine learning called multi-task learning, in which the aim is to simultaneously learn a number of separate models for a collection of different (but typically correlated) tasks. Multi-task learning scenarios are conceptually similar to commonly studied contextuality scenarios, and this similarity leads us to a definition of what it means for a multi-task model to be contextual (Section 3). Although the focus on multi-class learning problems appears restrictive, as the separation between tasks is arbitrary at the mathematical level, we also arrive at a notion of contextuality in the single task setting (Section 6). In particular, we argue that it makes sense to think of contextuality as a property relative to a particular inductive bias of a model, rather than a property of the model as a whole.
# Once we have described our framework, our second task will be to identify specific learning problems for which contextuality plays a role (Section 4). We show that this is the case when learning probabilistic models from data sets which feature a linearly conserved quantity in a discrete label space (see Figure 1). Such data sets can arise naturally from experiments involving conserved quantities, zero-sum game scenarios, logistics with conserved resources, substance diffusion in biological systems, and human mobility and migration. We show that the ability of a model to encode the conserved quantity as an inductive bias directly links to a central concept in generalised contextuality, called \textit{operational equivalence}. This results in a constraint on noncontextual learning models that encode the desired bias, which amounts to a limit on the expressivity of noncontextual model classes. For certain data sets, this limitation can negatively impact generalisation performance due to the lack of a suitable model within the class that matches the underlying data distribution; in such cases contextuality may therefore be required for learning. To illustrate this point, in Section 5 we construct a toy problem based on the rock, paper, scissors zero-sum game and prove precise limits on the expressivity of noncontextual model classes that attempt to learn the payoff behaviour of the game.
# In the final part of the work, we study the performance of quantum models for problems that involve our contextuality-inspired bias (Section 7). We first describe two approaches to construct quantum ansatze encoding the bias. The first of these encodes the bias into the state structure of the ansatz, and exploits tools from geometric quantum machine learning. The second approach encodes the bias into the measurement structure, and we present a new family of measurements to this end that may be of independent interest. We then use these tools in a simple numerical investigation (Section 8), inspired by a recent work of Schreiber et al.. Using the fact that quantum machine learning models are equivalent to truncated Fourier series, the authors of define the notion of a classical surrogate model: a linear Fourier features model that has access to the same frequencies of the quantum model, but which lacks its specific inductive bias. The authors found that classical surrogate model classes perform better than quantum model classes on a wide range of regression tasks, the message being that it is still unclear what the inductive bias of quantum machine learning is useful for. In our numerical study, we show that a quantum model class that encodes our contextuality-inspired bias achieves a lower generalisation error than the corresponding surrogate model classes at a specific learning task, even after allowing for regularisation in the surrogate model. We argue that this is due to the fact that the bias cannot be easily encoded into the surrogate model class, which therefore cannot exploit this information during learning.
# In Section 9 we elaborate on a number of areas where contextuality-inspired inductive bias can be expected to play a role in learning. Many of these areas are classical in nature, and therefore suggests that quantum machine learning may be suited to tackling classical learning problems with a specific structure. Finally, in Section 10, we outline our vision for this line of research and the possible next steps to take. Overall, we hope our approach and framework will lead to a new way of thinking about quantum machine learning, and ultimately lead to the identification of impactful problems where the specific structure of quantum theory makes quantum models the machine learning models of choice."""
if __name__ == "__main__":
final_intro, bib_file_content = ez_cite(introduction, debug=True)