Spaces:

du-lab
/

MLR-Copilot

Runtime error

App Files Files Community

Lim0011 committed on Aug 30

Commit

85e3d20

•

1 Parent(s): f13745f

Upload 251 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Dockerfile +18 -0
README.md +5 -6
app.py +743 -0
benchmarks/CLRS/env/__init__.py +14 -0
benchmarks/CLRS/env/baseline_model_description.txt +507 -0
benchmarks/CLRS/env/baselines.py +794 -0
benchmarks/CLRS/env/baselines_test.py +294 -0
benchmarks/CLRS/env/data_description.txt +35 -0
benchmarks/CLRS/env/dataset.py +326 -0
benchmarks/CLRS/env/dataset_test.py +116 -0
benchmarks/CLRS/env/decoders.py +381 -0
benchmarks/CLRS/env/decoders_test.py +47 -0
benchmarks/CLRS/env/encoders.py +139 -0
benchmarks/CLRS/env/evaluation.py +202 -0
benchmarks/CLRS/env/evaluation_test.py +55 -0
benchmarks/CLRS/env/losses.py +209 -0
benchmarks/CLRS/env/losses_test.py +166 -0
benchmarks/CLRS/env/model.py +46 -0
benchmarks/CLRS/env/nets.py +719 -0
benchmarks/CLRS/env/probing.py +351 -0
benchmarks/CLRS/env/probing_test.py +192 -0
benchmarks/CLRS/env/processors.py +856 -0
benchmarks/CLRS/env/processors_test.py +64 -0
benchmarks/CLRS/env/samplers.py +882 -0
benchmarks/CLRS/env/samplers_test.py +250 -0
benchmarks/CLRS/env/specs.py +525 -0
benchmarks/CLRS/env/train.py +560 -0
benchmarks/CLRS/scripts/eval.py +454 -0
benchmarks/CLRS/scripts/requirements.txt +13 -0
benchmarks/CLRS/scripts/research_problem.txt +3 -0
benchmarks/CLRS/scripts/source_code.txt +1 -0
benchmarks/amp-parkinsons-disease-progression-prediction/env/data_description.txt +33 -0
benchmarks/amp-parkinsons-disease-progression-prediction/env/evaluation_details.txt +12 -0
benchmarks/amp-parkinsons-disease-progression-prediction/env/public_timeseries_testing_util.py +94 -0
benchmarks/amp-parkinsons-disease-progression-prediction/env/train.py +141 -0
benchmarks/amp-parkinsons-disease-progression-prediction/scripts/eval.py +21 -0
benchmarks/amp-parkinsons-disease-progression-prediction/scripts/prepare.py +79 -0
benchmarks/amp-parkinsons-disease-progression-prediction/scripts/read_only_files.txt +5 -0
benchmarks/amp-parkinsons-disease-progression-prediction/scripts/research_problem.txt +3 -0
benchmarks/amp-parkinsons-disease-progression-prediction/scripts/source_code.txt +2 -0
benchmarks/babylm/env/babyLM_for_hf.py +104 -0
benchmarks/babylm/env/train.py +641 -0
benchmarks/babylm/scripts/eval.py +212 -0
benchmarks/babylm/scripts/prepare.py +11 -0
benchmarks/babylm/scripts/read_only_files.txt +2 -0
benchmarks/babylm/scripts/research_problem.txt +7 -0
benchmarks/bibtex-generation/env/arxiv_API_reference.txt +599 -0
benchmarks/bibtex-generation/env/bibtex_generation.py +0 -0
benchmarks/bibtex-generation/env/claude_example.py +11 -0
benchmarks/bibtex-generation/env/google_scholar_API_reference.txt +153 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,18 @@

+FROM anibali/pytorch:2.0.0-cuda11.8-ubuntu22.04
+# Run the system-level setup below as root so that no `sudo` is needed.
+USER root
+# Set up time zone.
+ENV TZ=UTC
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
+# Install gcc/g++ 10 and the zlib headers, and expose the compilers under their
+# unversioned names. `ln -sf` does not fail if the destination already exists.
+RUN apt-get update \
+    && apt-get install -y gcc-10 g++-10 zlib1g-dev \
+    && ln -sf /usr/bin/gcc-10 /usr/bin/gcc \
+    && ln -sf /usr/bin/g++-10 /usr/bin/g++ \
+    && rm -rf /var/lib/apt/lists/*
+# copy files
+WORKDIR /app
+COPY . .
+# Install libraries
+RUN python3 -m pip install -r requirements.txt
+# start bash shell
+CMD bash

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
-title: MLR Copilot
-emoji: 😻
-colorFrom: green
-colorTo: blue
 sdk: gradio
-sdk_version: 4.42.0
 app_file: app.py
 pinned: false
-license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: New Test Autoresearch
+emoji: 📉
+colorFrom: blue
+colorTo: gray
 sdk: gradio
+sdk_version: 4.41.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,743 @@

+import random
+import gradio as gr
+from pathlib import Path
+from reactagent.environment import Environment
+from reactagent.agents.agent_research import ResearchAgent
+from reactagent.runner import create_parser
+from reactagent import llm
+from reactagent.users.user import User
+# Global variables to store session state
+env = None
+agent = None
+# Predefined research paper text (example)
+predefined_paper_text = """
+    Title:
+    Dataset and Baseline for Automatic Student Feedback Analysis
+    Abstract:
+    This paper presents a student feedback corpus containing 3000 instances of feedback written by university students. The dataset has been annotated for aspect terms, opinion terms, polarities of the opinion terms towards targeted aspects, document-level opinion polarities, and sentence separations. A hierarchical taxonomy for aspect categorization covering all areas of the teaching-learning process was developed. Both implicit and explicit aspects were annotated using this taxonomy. The paper discusses the annotation methodology, difficulties faced during the annotation, and details about aspect term categorization. The annotated corpus can be used for Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis. Baseline results for all three tasks are provided.
+"""
+# Predefined extracted elements based on the paper text
+predefined_research_tasks = "The primary research tasks include the creation of a comprehensive student feedback corpus, aspect term annotation, opinion polarity annotation, and the development of a hierarchical taxonomy."
+predefined_research_gaps = "Gaps include the lack of detailed aspect-level annotations in existing datasets and the focus on document-level sentiment analysis."
+predefined_keywords = "Student Feedback Corpus, Aspect Terms, Opinion Terms, Polarity, Hierarchical Taxonomy, Aspect Extraction, Aspect Level Sentiment Analysis, Document Level Sentiment Analysis"
+predefined_recent_works = """
+    1. "Students feedback analysis model using deep learning-based method and linguistic knowledge for intelligent educational systems."
+    2. "An Automated Approach for Analysing Students Feedback Using Sentiment Analysis Techniques."
+    """
+# Extraction function to simulate the extraction of Research Tasks (t), Research Gaps (g), Keywords (k), and Recent Works (R)
+def extract_research_elements(paper_text):
+    """Return the predefined research tasks, gaps, keywords and recent works.
+    ``paper_text`` is accepted but never read: the same module-level values
+    are returned whatever the input is.
+    """
+    extracted = (
+        predefined_research_tasks,
+        predefined_research_gaps,
+        predefined_keywords,
+        predefined_recent_works,
+    )
+    return extracted
+# Generation function for Research Hypothesis and Experiment Plan
+def generate_research_idea_and_plan(tasks, gaps, keywords, recent_works):
+    """Return the canned ``(hypothesis, experiment_plan)`` text pair.
+    The four arguments are accepted but not used; both texts are fixed
+    literals. The sub-headings "Continuous Learning" and "User Interface
+    Development" are placed on their own lines instead of being fused onto the
+    end of the preceding bullet.
+    """
+    hypothesis = """
+    Method: Advanced Aspect-Level Sentiment Analysis of Student Feedback Using a Hybrid Deep Learning Approach
+    Step 1: Dataset Enhancement
+    Data Collection and Preprocessing
+    * Collect additional student feedback from multiple universities to expand the existing dataset.
+    * Preprocess the data to ensure uniformity in annotation and eliminate noise, such as redundant information and grammatical errors.
+    Annotation Refinement
+    * Use advanced NLP techniques to further refine the aspect terms, opinion terms, and polarities.
+    * Incorporate semi-supervised learning methods to improve annotation accuracy, utilizing both manual and automated processes.
+    Step 2: Model Development
+    Hybrid Model Architecture
+    * Develop a hybrid model that integrates CNN, BiLSTM, and attention mechanisms, similar to the DTLP approach mentioned in the recent work by DTLP (Deep Learning and Teaching Process).
+    * Incorporate a Transformer-based model (like BERT) to capture contextual nuances and improve the understanding of implicit aspects.
+    Feature Integration
+    * Enhance the feature set by combining statistical, linguistic, and sentiment knowledge features with word embeddings.
+    * Include sentiment shifter rules and contextual polarity indicators to address challenges in sentiment analysis.
+    Step 3: Training and Validation
+    Model Training
+    * Train the hybrid model using the enhanced dataset.
+    * Use cross-validation techniques to ensure robustness and prevent overfitting.
+    Baseline Comparisons
+    * Compare the model's performance with baseline results provided in the original study and other recent works.
+    * Use metrics such as accuracy, precision, recall, and F1-score to evaluate model performance across different tasks, including Aspect Extraction, Aspect Level Sentiment Analysis, and Document Level Sentiment Analysis.
+    Step 4: Iterative Refinement
+    Feedback Loop
+    * Implement an iterative feedback loop where the model's predictions are reviewed and corrected, improving the model iteratively.
+    * Engage domain experts in the review process to ensure the relevance and accuracy of the feedback.
+    Continuous Learning
+    * Utilize active learning techniques to continuously update the model with new data, ensuring it remains up-to-date with current trends in student feedback.
+    Step 5: Deployment and Application
+    Integration with Educational Systems
+    * Deploy the model as a part of an intelligent educational system to analyze student feedback in real-time.
+    * Provide actionable insights to educators and administrators to improve teaching methods and curriculum design.
+    User Interface Development
+    * Develop an intuitive user interface that allows educators to interact with the model, view feedback analysis, and generate reports.
+    """
+    experiment_plan = """
+    Experiment: Validating the Hybrid Deep Learning Approach for Aspect-Level Sentiment Analysis of Student Feedback
+    Objective:
+    To validate the effectiveness of the proposed hybrid deep learning approach (combining CNN, BiLSTM, and Transformer models) for aspect-level sentiment analysis of student feedback by comparing its performance with baseline methods and recent works.
+    Research Problem:
+    Current sentiment analysis models for student feedback lack detailed aspect-level annotations and fail to address implicit aspects and contextual nuances in feedback data.
+    Proposed Method:
+    A hybrid deep learning model integrating CNN, BiLSTM, and Transformer-based models (like BERT) to enhance aspect-level sentiment analysis. The method incorporates sentiment shifter rules and contextual polarity indicators to address challenges in sentiment analysis.
+    Experiment Design:
+    1. Dataset Preparation:
+        * Existing Dataset: Use the dataset provided by Herath et al. (2022) with 3000 instances of student feedback, annotated for aspect terms, opinion terms, polarities, and document-level sentiments.
+        * Data Augmentation: Expand the dataset by collecting additional feedback from multiple universities, ensuring diversity in feedback data.
+    2. Preprocessing:
+        * Clean the data to remove noise and inconsistencies.
+        * Tokenize the text and apply part-of-speech tagging.
+        * Annotate additional feedback instances using the refined hierarchical taxonomy.
+    3. Model Training:
+        * Baseline Models: Implement and train traditional machine learning models (e.g., SVM, Naive Bayes) and existing deep learning models (e.g., LSTM, BiLSTM) for sentiment analysis.
+        * Proposed Hybrid Model: Train the proposed hybrid model combining CNN, BiLSTM, and Transformer (BERT) layers. Use pre-trained embeddings and fine-tune on the feedback dataset.
+    4. Feature Extraction:
+        * Extract features using word embeddings, sentiment shifter rules, and contextual polarity indicators.
+        * Integrate statistical, linguistic, and sentiment knowledge features with word embeddings to form a comprehensive feature set.
+    5. Evaluation Metrics:
+        * Measure the performance of models using accuracy, precision, recall, and F1-score.
+        * Perform aspect-level evaluation by analyzing the accuracy of aspect term extraction and sentiment classification.
+    6. Experiment Execution:
+        * Training Phase: Train the baseline models and the proposed hybrid model on the training dataset.
+        * Validation Phase: Validate the models using cross-validation techniques to ensure robustness and prevent overfitting.
+        * Testing Phase: Evaluate the models on a held-out test set to compare their performance.
+    7. Comparison and Analysis:
+        * Compare the performance of the proposed hybrid model with baseline models and recent works, such as DTLP and other sentiment analysis techniques.
+        * Analyze the results to identify strengths and weaknesses of the proposed model in handling aspect-level sentiment analysis and implicit aspects.
+    8. Iterative Refinement:
+        * Implement an iterative feedback loop where predictions are reviewed and corrected, improving model performance over iterations.
+        * Engage domain experts to review the model's predictions and provide feedback for further refinement.
+    9. Deployment:
+        * Integrate the validated model into an intelligent educational system for real-time feedback analysis.
+        * Develop a user interface to allow educators to interact with the model, view feedback analysis, and generate reports.
+    """
+    return hypothesis, experiment_plan
+predefined_action_log = """
+[Reasoning]: To understand the initial structure and functionality of train.py for effective improvements.
+[Action]: Inspect Script (train.py)
+Input: {"script_name": "train.py", "start_line_number": "1", "end_line_number": "74"}
+Objective: Understand the training script, including data processing, [...]
+[Observation]: The train.py script imports [...]. Sets random seeds [...]. Defines [...] Placeholder functions [...] exist without implementation. [...]
+[Feedback]: The script structure is clear, but key functions (train_model, predict) need proper implementation for proposed model training and prediction.
+"""
+predefined_response = """
+[Reasoning]: Execute the "final_model.py" using ExecuteScript action to evaluate performance of the final model.
+[Action]: Execute "final_model.py" using ExecuteScript action.
+Input: {"script_name": "final_model.py"}
+"""
+predefined_observation = """
+Epoch [1/10],
+Train MSE: 0.543,
+Test MSE: 0.688
+Epoch [2/10],
+Train MSE: 0.242,
+Test MSE: 0.493
+"""
+# # Structured input as list of dictionaries
+# process_steps = [
+#     "Action: Inspect Script Lines (train.py)\nObservation: The train.py script imports necessary libraries (e.g., pandas, sklearn, torch). Sets random seeds for reproducibility. Defines compute_metrics_for_regression function to calculate RMSE for different dimensions. Placeholder functions train_model and predict exist without implementations.\nFeedback: The script structure is clear, but key functions (train_model, predict) need proper implementation for proposed model training and prediction.",
+#     "Action: Execute Script (train.py)\nObservation: The script executed successfully. Generated embeddings using the BERT model. Completed the training process without errors. Metrics calculation placeholders indicated areas needing implementation.\nFeedback: Experimental model definition and training logic are missing.",
+#     "Action: Edit Script (train.py)\nObservation: Edited train.py to separate data loading, model definition, training loop, and evaluation into distinct functions. The edited train.py now has clearly defined functions for data loading (load_data), model definition (build_model), training (train_model), and evaluation (evaluate_model). Similarly, eval.py is reorganized to load the model and perform predictions efficiently.\nFeedback: Modify model architecture, retrieve the hybrid model of CNN, BiLSTM, and attention mechanisms, similar to the DTLP to align with the experiment design.",
+#     "Action: Retrieve Model\nObservation: CNN and BiLSTM retrieved.\nFeedback: Modify the model architecture.",
+#     "Action: Execute Script (train.py)\nObservation: The model trained over the specified number of epochs. Training and validation loss values are recorded for each epoch, the decrease in loss indicates improved model performance.\nFeedback: Continue with the next steps in model evaluation.",
+#     predefined_observation
+# ]
+action_list = [
+    predefined_response,
+    predefined_observation
+]
+# Predefined code to display in Phase 2
+predefined_code = """import pandas as pd
+from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+import numpy as np
+import random
+import torch
+from sklearn.model_selection import train_test_split
+DIMENSIONS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
+SEED = 42
+random.seed(SEED)
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+def compute_metrics_for_regression(y_test, y_test_pred):
+    metrics = {}
+    for task in DIMENSIONS:
+        targets_task = [t[DIMENSIONS.index(task)] for t in y_test]
+        pred_task = [l[DIMENSIONS.index(task)] for l in y_test_pred]
+        rmse = mean_squared_error(targets_task, pred_task, squared=False)
+        metrics[f"rmse_{task}"] = rmse
+    return metrics
+def train_model(X_train, y_train, X_valid, y_valid):
+    model = None  # Placeholder for model training
+    return model
+def predict(model, X):
+    y_pred = np.random.rand(len(X), len(DIMENSIONS))
+    return y_pred
+if __name__ == '__main__':
+    ellipse_df = pd.read_csv('train.csv',
+                            header=0, names=['text_id', 'full_text', 'Cohesion', 'Syntax',
+                            'Vocabulary', 'Phraseology','Grammar', 'Conventions'],
+                            index_col='text_id')
+    ellipse_df = ellipse_df.dropna(axis=0)
+    data_df = ellipse_df
+    X = list(data_df.full_text.to_numpy())
+    y = np.array([data_df.drop(['full_text'], axis=1).iloc[i] for i in range(len(X))])
+    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)
+    model = train_model(X_train, y_train, X_valid, y_valid)
+    y_valid_pred = predict(model, X_valid)
+    metrics = compute_metrics_for_regression(y_valid, y_valid_pred)
+    print(metrics)
+    print("final MCRMSE on validation set: ", np.mean(list(metrics.values())))
+    submission_df = pd.read_csv('test.csv',  header=0, names=['text_id', 'full_text'], index_col='text_id')
+    X_submission = list(submission_df.full_text.to_numpy())
+    y_submission = predict(model, X_submission)
+    submission_df = pd.DataFrame(y_submission, columns=DIMENSIONS)
+    submission_df.index = submission_df.index.rename('text_id')
+    submission_df.to_csv('submission.csv')
+"""
+final_code = """
+* Resulting train.py:
+import pandas as pd
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+from transformers import BertTokenizer, BertModel
+# Define constants
+DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
+class EssayDataset(Dataset):
+    def __init__(self, texts, targets, tokenizer, max_len):
+        self.texts = texts
+        self.targets = targets
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, item):
+        text = self.texts[item]
+        target = self.targets[item]
+        encoding = self.tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=self.max_len,
+            return_token_type_ids=False,
+            padding='max_length',
+            return_attention_mask=True,
+            return_tensors='pt',
+            truncation=True
+        )
+        return {
+            'text': text,
+            'input_ids': encoding['input_ids'].flatten(),
+            'attention_mask': encoding['attention_mask'].flatten(),
+            'targets': torch.tensor(target, dtype=torch.float)
+        }
+class EssayScoreRegressor(nn.Module):
+    def __init__(self, n_outputs):
+        super(EssayScoreRegressor, self).__init__()
+        self.bert = BertModel.from_pretrained('bert-base-uncased')
+        self.drop = nn.Dropout(p=0.3)
+        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)
+    def forward(self, input_ids, attention_mask):
+        pooled_output = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask
+        )['pooler_output']
+        output = self.drop(pooled_output)
+        return self.out(output)
+def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
+    model = model.train()
+    losses = []
+    for d in data_loader:
+        input_ids = d['input_ids'].to(device)
+        attention_mask = d['attention_mask'].to(device)
+        targets = d['targets'].to(device)
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        loss = loss_fn(outputs, targets)
+        losses.append(loss.item())
+        loss.backward()
+        optimizer.step()
+        scheduler.step()
+        optimizer.zero_grad()
+    return np.mean(losses)
+def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
+    train_dataset = EssayDataset(
+        texts=train_data['full_text'].to_numpy(),
+        targets=train_data[DIMENSIONS].to_numpy(),
+        tokenizer=tokenizer,
+        max_len=max_len
+    )
+    val_dataset = EssayDataset(
+        texts=val_data['full_text'].to_numpy(),
+        targets=val_data[DIMENSIONS].to_numpy(),
+        tokenizer=tokenizer,
+        max_len=max_len
+    )
+    train_data_loader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        shuffle=True
+    )
+    val_data_loader = DataLoader(
+        val_dataset,
+        batch_size=batch_size,
+        shuffle=False
+    )
+    loss_fn = nn.MSELoss().to(device)
+    for epoch in range(epochs):
+        print(f'Epoch {epoch + 1}/{epochs}')
+        print('-' * 10)
+        train_loss = train_epoch(
+            model,
+            train_data_loader,
+            loss_fn,
+            optimizer,
+            device,
+            scheduler,
+            len(train_dataset)
+        )
+        print(f'Train loss {train_loss}')
+if __name__ == "__main__":
+    df = pd.read_csv('train.csv')
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
+    model = model.to(device)
+    optimizer = optim.Adam(model.parameters(), lr=2e-5)
+    total_steps = len(df) // 16 * 5
+    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)
+    train_data = df.sample(frac=0.8, random_state=42)
+    val_data = df.drop(train_data.index)
+    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs=5, batch_size=16, max_len=160)
+* eval.py
+import sys
+import os
+import pandas as pd
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from transformers import BertTokenizer
+from importlib import reload
+import train
+# Constants
+DIMENSIONS = train.DIMENSIONS
+class EssayDataset(Dataset):
+    def __init__(self, texts, targets, tokenizer, max_len):
+        self.texts = texts
+        self.targets = targets
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, item):
+        text = self.texts[item]
+        target = self.targets[item]
+        encoding = self.tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=self.max_len,
+            return_token_type_ids=False,
+            padding='max_length',
+            return_attention_mask=True,
+            return_tensors='pt',
+            truncation=True
+        )
+        return {
+            'text': text,
+            'input_ids': encoding['input_ids'].flatten(),
+            'attention_mask': encoding['attention_mask'].flatten(),
+            'targets': torch.tensor(target, dtype=torch.float)
+        }
+def get_score(submission_folder="../env"):
+    submission_path = os.path.join(submission_folder, "submission.csv")
+    solution = pd.read_csv(os.path.join(os.path.dirname(__file__), "answer.csv"))[DIMENSIONS].to_numpy()
+    submission = pd.read_csv(submission_path)[DIMENSIONS].to_numpy()
+    metrics = train.compute_metrics_for_regression(solution, submission)
+    return np.mean(list(metrics.values()))
+def eval_model(model, data_loader, device, n_examples):
+    model = model.eval()
+    predictions = []
+    with torch.no_grad():
+        for d in data_loader:
+            input_ids = d['input_ids'].to(device)
+            attention_mask = d['attention_mask'].to(device)
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            predictions.extend(outputs.cpu().numpy())
+    return predictions
+if __name__ == "__main__":
+    reload(train)
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = train.EssayScoreRegressor(n_outputs=len(DIMENSIONS))
+    model.load_state_dict(torch.load('model.bin'))
+    model = model.to(device)
+    test_data = pd.read_csv('test.csv')
+    test_dataset = EssayDataset(
+        texts=test_data['full_text'].to_numpy(),
+        targets=np.zeros((len(test_data), len(DIMENSIONS))),  # Dummy targets
+        tokenizer=tokenizer,
+        max_len=160
+    )
+    test_data_loader = DataLoader(
+        test_dataset,
+        batch_size=16,
+        shuffle=False
+    )
+    predictions = eval_model(
+        model,
+        test_data_loader,
+        device,
+        len(test_dataset)
+    )
+    submission = pd.DataFrame(predictions, columns=DIMENSIONS)
+    submission['text_id'] = test_data['text_id']
+    submission.to_csv(os.path.join("../env", 'submission.csv'), index=False)
+    print(get_score())
+"""
+class SessionInfo:
+    """Keeps one running agent coroutine per Gradio session hash."""
+    def __init__(self):
+        # Maps a session hash to the object returned by ``agent.run(env)``.
+        self.coro_cache = {}
+        self.parser = create_parser()
+    def make_session(self, prompt, session_hash):
+        """Create the environment and agent for a session and cache its coroutine.
+        ``prompt`` is passed as the research problem. Logs and workspace
+        directories are keyed by ``session_hash``, which is also returned.
+        """
+        llm_name = 'claude-3-5-sonnet-20240620'
+        fastllm_name = 'claude-3-haiku-20240307'
+        rawargs = [
+            '--research-problem', prompt,
+            '--log-dir', str(Path('logs', session_hash)),
+            '--work-dir', str(Path('workspaces', session_hash)),
+            '--llm-name', llm_name,
+            '--edit-script-llm-name', llm_name,
+            '--fast-llm-name', fastllm_name,
+        ]
+        args = self.parser.parse_args(rawargs)
+        # llm.FAST_MODEL = args.fast_llm_name
+        env = Environment(args)
+        # The module-level ``agent`` starts out as None, so calling
+        # ``agent.run`` on it raised AttributeError. Use it only if something
+        # else has set it; otherwise build an agent for this session.
+        # NOTE(review): ResearchAgent(args, env) mirrors the previously
+        # commented-out call; confirm the constructor signature.
+        session_agent = agent if agent is not None else ResearchAgent(args, env)
+        self.coro_cache[session_hash] = session_agent.run(env)
+        return session_hash
+    def get_response(self, human_input, session_hash):
+        """Advance the session's coroutine by one step and return what it yields.
+        A session is created on first use, with ``human_input`` as its prompt.
+        Returns None once the coroutine is exhausted; the session is then
+        dropped so the next message starts a new one.
+        """
+        coro_input = human_input
+        if session_hash not in self.coro_cache:
+            self.make_session(human_input, session_hash)
+            # A coroutine/generator that has not started only accepts None.
+            coro_input = None
+        try:
+            return self.coro_cache[session_hash].send(coro_input)
+        except StopIteration:
+            del self.coro_cache[session_hash]
+            return None
+session_info = SessionInfo()
+def info_to_message(info):
+    """Render an agent info mapping as one text message.
+    Each entry becomes a 64-dash divider, the key, and the value indented via
+    ``User.indent_text``. A dict value is first flattened into
+    "name:\\n  value" lines. The message ends with a request for feedback.
+    """
+    divider = '-' * 64
+    sections = []
+    for key, value in info.items():
+        if isinstance(value, dict):
+            value = "".join(f"{k2}:\n  {v2}\n" for k2, v2 in value.items())
+        body = User.indent_text(value, 2)
+        sections.append(f"{divider}\n{key}:\n{body}\n")
+    closing = "Please provide feedback based on the history, response entries, and observation, and questions: "
+    return "".join(sections) + closing
+def predict(message, history, request: gr.Request):
+    """Forward a chat message to the session's agent and format its reply.
+    ``history`` is accepted but not read. The session is identified by
+    ``request.session_hash``. A fixed notice is returned once the agent has
+    finished.
+    """
+    agent_output = session_info.get_response(message, request.session_hash)
+    if agent_output is not None:
+        return info_to_message(agent_output)
+    return "Agent is finished. Enter a new instruction."
+# Scripted action/observation pairs for the canned walkthrough; the last
+# entry reuses predefined_observation.
+process_steps = [
+    {
+        "Action": "Inspect Script Lines (train.py)",
+        "Observation": (
+            "The train.py script imports necessary libraries (e.g., pandas, sklearn, torch). "
+            "Sets random seeds for reproducibility. Defines compute_metrics_for_regression function "
+            "to calculate RMSE for different dimensions. Placeholder functions train_model and "
+            "predict exist without implementations."
+        ),
+    },
+    {
+        "Action": "Execute Script (train.py)",
+        "Observation": (
+            "The script executed successfully. Generated embeddings using the BERT model. Completed "
+            "the training process without errors. Metrics calculation placeholders indicated areas needing implementation."
+        ),
+    },
+    {
+        "Action": "Edit Script (train.py)",
+        "Observation": (
+            "Edited train.py to separate data loading, model definition, training loop, and evaluation into distinct functions. "
+            # Trailing space added: the adjacent literals used to join as "functionsfor".
+            "The edited train.py now has clearly defined functions "
+            "for data loading (load_data), model definition (build_model), "
+            "training (train_model), and evaluation (evaluate_model). Similarly, eval.py is reorganized to load the model and perform predictions efficiently."
+        ),
+    },
+    {
+        "Action": "Retrieve Model",
+        "Observation": "CNN and BiLSTM retrieved.",
+    },
+    {
+        "Action": "Execute Script (train.py)",
+        "Observation": (
+            "The model trained over the specified number of epochs. Training and validation loss values are recorded for each epoch, "
+            "the decrease in loss indicates improved model performance."
+        ),
+    },
+    {
+        "Action": "Evaluation",
+        "Observation": predefined_observation,
+    },
+]
+# step_index = 0
+# def info_to_message(info):
+#     msg = "Agent Response:\n"
+#     for k, v in info.items():
+#         if isinstance(v, dict):
+#             tempv = v
+#             v = ""
+#             for k2, v2 in tempv.items():
+#                 v += f"{k2}:\n  {v2}\n"
+#         v = User.indent_text(v, 2)
+#         msg += '-' * 64
+#         msg += '\n'
+#         msg += f"{k}:\n{v}\n"
+#     msg += "Please provide feedback based on the history, response entries, and observation, and questions: "
+#     print(msg)
+#     return msg
+# def predict(message, history):
+#     global step_index  # Declare the use of global variable
+#     if step_index < len(process_steps):
+#         response_info = process_steps[step_index]
+#         response = info_to_message(response_info)  # Convert dictionary to formatted string
+#         step_index += 1
+#     else:
+#         response = "Agent Finished."
+#     return response, "N/A"  # Return the formatted string and clear input
+# Gradio Interface
+with gr.Blocks() as app:
+    # Top-level UI container; `app` is what gets launched at the bottom of the file.
+    gr.Markdown("# AI Research Assistant with Research Agent")
+    # Use state variables to store generated hypothesis and experiment plan
+    # so that the second tab can read what the first tab produced.
+    hypothesis_state = gr.State("")
+    experiment_plan_state = gr.State("")
+    # Phase 1: Research Idea Generation Tab
+    with gr.Tab("Phase 1: Research Idea Generation"):
+        gr.Markdown("### Extract Research Elements and Generate Research Ideas")
+        with gr.Row():
+            # Left column: paper text input plus the four read-only extraction results.
+            with gr.Column():
+                paper_text_input = gr.Textbox(value=predefined_paper_text, lines=10, label="Research Paper Text")
+                extract_button = gr.Button("Extract Research Elements")
+                with gr.Row():
+                    tasks_output = gr.Textbox(placeholder="Research task definition", label="Research Tasks", lines=2, interactive=False)
+                    gaps_output = gr.Textbox(placeholder="Research gaps of current works", label="Research Gaps", lines=2, interactive=False)
+                    keywords_output = gr.Textbox(placeholder="Paper keywords", label="Keywords", lines=2, interactive=False)
+                    recent_works_output = gr.Textbox(placeholder="Recent works extracted from Semantic Scholar", label="Recent Works", lines=2, interactive=False)
+            # Right column: generate button and the two read-only idea textboxes.
+            with gr.Column():
+                with gr.Row():  # Move the button to the top right
+                    generate_button = gr.Button("Generate Research Hypothesis & Experiment Plan")
+                with gr.Group():
+                    gr.Markdown("### Research Idea")
+                    with gr.Row():
+                        hypothesis_output = gr.Textbox(label="Generated Hypothesis", lines=45, interactive=False)
+                        experiment_plan_output = gr.Textbox(label="Generated Experiment Plan", lines=45, interactive=False)
+        # Step 1: Extract Research Elements
+        # `extract_research_elements` is defined elsewhere in this file; it receives
+        # the paper text and must return four values, one per output textbox.
+        extract_button.click(
+            fn=extract_research_elements,
+            inputs=paper_text_input,
+            outputs=[tasks_output, gaps_output, keywords_output, recent_works_output]
+        )
+        # Step 2: Generate Research Hypothesis and Experiment Plan
+        def generate_and_store(tasks, gaps, keywords, recent_works):
+            """Generate the idea and return it twice: for display and for storage.
+
+            Calls `generate_research_idea_and_plan` (defined elsewhere in this
+            file) and unpacks its result into a hypothesis and an experiment
+            plan. Each value is returned twice so one copy feeds the visible
+            textbox and the other feeds the matching `gr.State` (see the
+            `outputs` list of the click handler below).
+            """
+            hypothesis, experiment_plan = generate_research_idea_and_plan(tasks, gaps, keywords, recent_works)
+            return hypothesis, experiment_plan, hypothesis, experiment_plan
+        generate_button.click(
+            fn=generate_and_store,
+            inputs=[tasks_output, gaps_output, keywords_output, recent_works_output],
+            outputs=[hypothesis_output, experiment_plan_output, hypothesis_state, experiment_plan_state]
+        )
+    # Phase 2 & 3: experiment implementation and execution tab
+    with gr.Tab("Phase 2&3: Experiment implementation and execution"):
+        gr.Markdown("### Interact with the ExperimentAgent")
+        with gr.Row():
+            # Column 1: read-only copies of the Phase 1 hypothesis and plan
+            # (filled by the `hypothesis_state.change` handler further down).
+            with gr.Column():
+                idea_input = gr.Textbox(label="Research Hypothesis", lines=30, interactive=False)
+                plan_input = gr.Textbox(label="Experiment Plan", lines=30, interactive=False)
+            # Column 2: start button, execution log and code viewer.
+            with gr.Column():
+                execute_button = gr.Button("Start ExperimentAgent", elem_classes=["execute-btn"])
+                with gr.Group():
+                    gr.Markdown("### Implementation + Execution Log")
+                    log = gr.Textbox(label="Execution Log", lines=20, interactive=False)
+                    code_display = gr.Code(label="Implementation", language="python", interactive=False)
+            # Column 3: latest agent response, free-text user feedback and submit button.
+            with gr.Column():
+                # chatbot = gr.ChatInterface(predict)
+                response = gr.Textbox(label = "ExperimentAgent Response",  lines=30, interactive=False)
+                feedback = gr.Textbox(placeholder="N/A", label = "User Feedback",  lines=3, interactive=True)
+                submit_button = gr.Button("Submit", elem_classes=["Submit-btn"])
+        def submit_feedback(user_feedback, history, previous_response):
+            global step_index
+            if_end = False
+            step_index += 1
+            msg = history
+            if step_index < len(process_steps):
+                msg += previous_response + "\nUser feedback:" + user_feedback +"\n\n"
+                response_info = process_steps[step_index]
+                response = info_to_message(response_info)  # Convert dictionary to formatted string
+                step_index += 1
+            else:
+                if_end = True
+                response = "Agent Finished."
+            msg += response
+            return msg, response, predefined_code if if_end else final_code
+# def predict(message, history):
+#     global step_index  # Declare the use of global variable
+#     if step_index < len(process_steps):
+#         response_info = process_steps[step_index]
+#         response = info_to_message(response_info)  # Convert dictionary to formatted string
+#         step_index += 1
+#     else:
+#         response = "Agent Finished."
+        # Automatically populate the hypothesis and plan in Phase 2
+        def load_phase_2_inputs(hypothesis, plan):
+            return hypothesis, plan, "# Code implementation will be displayed here after Start ExperimentAgent."
+        # Function to implement and execute with the research agent
+        def implement_and_execute(hypothesis, plan):
+            predefined_message = f"Implement the following hypothesis and experiment plan:\n\nHypothesis:\n{hypothesis}\n\nExperiment Plan:\n{plan}"
+            return predefined_code, predefined_action_log
+        # When the stored hypothesis changes, copy hypothesis and plan into the
+        # read-only Phase 2 textboxes and reset the code viewer to its placeholder.
+        hypothesis_state.change(
+            fn=load_phase_2_inputs,
+            inputs=[hypothesis_state, experiment_plan_state],
+            outputs=[idea_input, plan_input, code_display]
+        )
+        # Trigger the research agent execution with the predefined hypothesis and plan
+        execute_button.click(
+            fn=implement_and_execute,
+            inputs=[hypothesis_state, experiment_plan_state],
+            outputs=[code_display, log]
+        )
+        # Each submit feeds the feedback text, the current log and the displayed
+        # response to `submit_feedback`, which rewrites the log, the response and
+        # the code viewer.
+        submit_button.click(
+            fn=submit_feedback,
+            inputs=[feedback, log, response],
+            outputs=[log, response, code_display]
+        )
+if __name__ == "__main__":
+    # app.launch(share=True)
+    step_index = 0
+    app.launch()

benchmarks/CLRS/env/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

benchmarks/CLRS/env/baseline_model_description.txt ADDED Viewed

	@@ -0,0 +1,507 @@

+The BaselineModel class in baselines.py file is a full working Graph Neural Network (GNN) example using JAX and the DeepMind JAX Ecosystem of libraries. It allows training of multiple algorithms on a single processor, as described in the paper "A Generalist Neural Algorithmic Learner" (arXiv:2209.11142v2 [cs.LG] 3 Dec 2022). Below is an excerpt from the paper that describes the model:
+Each algorithm in the CLRS benchmark [5] is specified by a number of inputs, hints and outputs. In
+a given sample, the inputs and outputs are fixed, while hints are time-series of intermediate states of
+the algorithm. Each sample for a particular task has a size, n, corresponding to the number of nodes
+in the GNN that will execute the algorithm.
+A sample of every algorithm is represented as a graph, with each input, output and hint located in
+either the nodes, the edges, or the graph itself, and therefore has shape (excluding batch dimension,
+and, for hints, time dimension) n × f , n × n × f , or f , respectively, f being the dimensionality of
+the feature, which depends on its type. The CLRS benchmark defines five types of features: scalar,
+categorical, mask, mask_one and pointer, with their own encoding and decoding strategies and
+loss functions—e.g. a scalar type will be encoded and decoded directly by a single linear layer, and
+optimised using mean squared error.
+Base Model
+Encoder. We adopt the same encode-process-decode paradigm [33] presented with the CLRS
+benchmark [5]. At each time step, t, of a particular task τ (e.g. insertion sort), the task-based encoder
+fτ , consisting of a linear encoder for each input and hint, embeds inputs and the current hints as
+high-dimensional vectors. These embeddings of inputs and hints located in the nodes all have the
+same dimension and are added together; the same happens with hints and inputs located in edges,
+and in the graph. In our experiments we use the same dimension, h = 128, for node, edge and graph
+3
+A Generalist Neural Algorithmic Learner
+embeddings. Thus, at the end of the encoding step for a time-step $t$ of the algorithm, we have a
+single set of embeddings $\{x_i^{(t)}, e_{ij}^{(t)}, g^{(t)}\}$, shapes n × h, n × n × h, and h, in the nodes, edges and
+graph, respectively. Note that this is independent of the number and type of the inputs and hints of
+the particular algorithm, allowing us to share this latent space across all thirty algorithms in CLRS.
+Further, note that at each step, the input encoding is fed directly to these embeddings—this recall
+mechanism significantly improves the model’s robustness over long trajectories [34].
+Processor. The embeddings are fed into a processor P, a GNN that performs one step of computation. The processor transforms the input node, edge and graph embeddings into processed
+node embeddings, $h_i^{(t)}$. Additionally, the processor uses the processed node embeddings from the
+previous step, $h_i^{(t-1)}$, as inputs. Importantly, the same processor model can operate on graphs of any
+size. We leverage the message-passing neural network [35, MPNN], using the max aggregation and
+passing messages over a fully-connected graph, as our base model. The MPNN computes processed
+embeddings as follows:
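+$$z^{(t)} = x_i^{(t)} \,\|\, h_i^{(t-1)}, \qquad m_i^{(t)} = \max_{1 \le j \le n} f_m\left(z_i^{(t)}, z_j^{(t)}, e_{ij}^{(t)}, g^{(t)}\right), \qquad h_i^{(t)} = f_r\left(z_i^{(t)}, m_i^{(t)}\right) \tag{1}$$
+starting from $h^{(0)} = 0$. Here $\|$ denotes concatenation, $f_m : \mathbb{R}^{2h} \times \mathbb{R}^{2h} \times \mathbb{R}^{h} \times \mathbb{R}^{h} \to \mathbb{R}^{h}$ is the
+message function (for which we use a three-layer MLP with ReLU activations), and $f_r : \mathbb{R}^{2h} \times \mathbb{R}^{h} \to \mathbb{R}^{h}$
+is the readout function (for which we use a linear layer with ReLU activation). The use of the max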
+aggregator is well-motivated by prior work [5, 9], and we use the fully connected graph—letting the
+neighbours j range over all nodes (1 ≤ j ≤ n)—in order to allow the model to overcome situations
+where the input graph structure may be suboptimal. Layer normalisation [36] is applied to $h_i^{(t)}$ before
+using them further. Further details on the MPNN processor may be found in Veličković et al. [5].
+Decoder. The processed embeddings are finally decoded with a task-based decoder gτ , to predict
+the hints for the next step, and the outputs at the final step. Akin to the encoder, the task-based decoder
+relies mainly on a linear decoder for each hint and output, along with a mechanism to compute
+pairwise node similarities when appropriate. Specifically, the pointer type decoder computes
+a score, $s_{ij}$, for each pair of nodes, and then chooses the pointer of node $i$ by taking either the
+$\operatorname{argmax}_j s_{ij}$ or $\operatorname{softmax}_j s_{ij}$ (depending on whether a hard or soft prediction is used).
+Loss. The decoded hints and outputs are used to compute the loss during training, according to their
+type [5]. For each sample in a batch, the hint prediction losses are averaged across hints and time,
+and the output loss is averaged across outputs (most algorithms have a single output, though some
+have two outputs). The hint loss and output loss are added together. Besides, the hint predictions at
+each time step are fed back as inputs for the next step, except possibly at train time if teacher forcing
+is used (see Section 3.2.1).
+We train the model on samples with sizes n ≤ 16, and periodically evaluate them on in-distribution
+samples of size n = 16. Also, periodically, we evaluate the model with the best in-distribution
+evaluation score so far on OOD samples of size n = 64. In what follows, we will be reporting only
+these OOD evaluation scores. Full details of the model, training and evaluation hyperparameters can
+be found in Appendix A.
+3.2
+Model improvements
+As previously discussed, single-task improvements, especially in terms of learning stability, will
+empirically transfer well to multi-task algorithmic learning. We now describe, in a gradual manner,
+all the changes made to the model, which have led to an absolute improvement of over 20% on
+average across all 30 tasks in CLRS.
+3.2.1
+Dataset and training
+Removing teacher forcing. At evaluation time, the model has no access to the step-by-step hints
+in the dataset, and has to rely on its own hint predictions. However, during training, it is sometimes
+advisable to stabilise the trajectories with teacher forcing [37]—providing the ground-truth hint
+values instead of the network’s own predictions. In the prior model [5], ground-truth hints were
+4
+A Generalist Neural Algorithmic Learner
+provided during training with probability 0.5, as, without teacher forcing, losses tended to grow
+unbounded along a trajectory when scalar hints were present, destabilising the training. In this
+work we incorporate several significant stabilising changes (described in future paragraphs), which
+allows us to remove teacher forcing altogether, aligning training with evaluation, and avoiding the
+network becoming overconfident in always expecting correct hint predictions. With teacher forcing,
+performance deteriorates significantly in sorting algorithms and Kruskal’s algorithm. Naïve String
+Matcher, on the other hand, improves with teacher forcing (see Appendix A, Figs. 7-9).
+Augmenting the training data. To prevent our model from over-fitting to the statistics of the fixed
+CLRS training dataset [5], we augmented the training data in three key ways, without breaking
+the intended size distribution shift. Firstly, we used the on-line samplers in CLRS to generate new
+training examples on the fly, rather than using a fixed dataset which is easier to overfit to. Secondly,
+we trained on examples of mixed sizes, n ≤ 16, rather than only 16, which helps the model anticipate
+for a diverse range of sizes, rather than overfitting to the specifics of size n = 16. Lastly, for graph
+algorithms, we varied the connectivity probability p of the input graphs (generated by the Erdős-Rényi
+model [38]); and for string matching algorithms, we varied the length of the pattern to be matched.
+These both serve to expose the model to different trajectory lengths; for example, in many graph
+algorithms, the amount of steps the algorithm should run for is related to the graph’s diameter, and
+varying the connection probability in the graph generation allows for varying the expected diameter.
+These changes considerably increase training data variability, compared to the original dataset in
+Veličković et al. [5]. We provide a more detailed step-by-step overview of the data generation process
+in Appendix A.
+Soft hint propagation. When predicted hints are fed back as inputs during training, gradients
+may or may not be allowed to flow through them. In previous work, only hints of the scalar type
+allowed gradients through, as all categoricals were post-processed from logits into the ground-truth
+format via argmax or thresholding before being fed back. Instead, in this work we use softmax
+for categorical, mask_one and pointer types, and the logistic sigmoid for mask types. Without
+these soft hints, performance in sorting algorithms degrades (similarly to the case of teacher forcing),
+as well as in Naïve String Matcher (Appendix A, Figs. 7-9).
+Static hint elimination. Eleven algorithms in CLRS [footnote 3] specify a fixed ordering of the nodes, common
+to every sample, via a node pointer hint that does not ever change along the trajectories. Prediction of
+this hint is trivial (identity function), but poses a potential problem for OOD generalisation, since the
+model can overfit to the fixed training values. We therefore turned this fixed hint into an input for
+these 11 algorithms, eliminating the need for explicitly predicting it.
+Improving training stability with encoder initialisation and gradient clipping. The scalar
+hints have unbounded values, in principle, and are optimised using mean-squared error, hence their
+gradients can quickly grow with increasing prediction error. Further, the predicted scalar hints then
+get re-encoded at every step, which can rapidly amplify errors throughout the trajectory, leading to
+exploding signals (and consequently gradients), even before any training takes place.
+To rectify this issue, we use the Xavier initialisation [45], effectively reducing the initial weights for
+scalar hints whose input dimensionality is just 1. However, we reverted to using the default LeCun
+initialisation [46] elsewhere. This combination of initialisations proved important for the initial
+learning stability of our model over long trajectories. Relatedly, in preliminary experiments, we saw
+drastic improvements in learning stability, as well as significant increases in validation performance,
+with gradient clipping [47], which we subsequently employed in all experiments.
+3.2.2
+Encoders and decoders
+Randomised position scalar. Across all algorithms in the dataset, there exists a position scalar
+input which uniquely indexes the nodes, with values linearly spaced between 0 and 1 along the node
+index. To avoid overfitting to these linearly spaced values during training, we replaced them with
+random values, uniformly sampled in [0, 1], sorted to match the initial order implied by the linearly
+spaced values. The benefit of this change is notable in algorithms where it would be easy to overfit to
+[Footnote 3] Binary Search, Minimum, Max Subarray [39], Matrix Chain Order, LCS Length, Optimal BST [40], Activity
+Selector [41], Task Scheduling [42], Naïve String Matcher, Knuth-Morris-Pratt [43] and Jarvis’ March [44].
+5
+A Generalist Neural Algorithmic Learner
+these positions, such as string matching. Namely, the model could learn to base all of its computations
+on the assumption that it will always be finding a m-character pattern inside an n-character string,
+even though at test time, m and n will increase fourfold.
+Permutation decoders and the Sinkhorn operator. Sorting algorithms (Insertion Sort, Bubble
+Sort, Heapsort [48] and Quicksort [49]) always output a permutation of the input nodes. In the CLRS
+benchmark, this permutation is encoded as a pointer where each node points to its predecessor in
+the sorted order (the first node points to itself); this is represented as a n × n matrix P where each
+row is a one-hot vector, such that element (i, j) is 1 if node i points to node j. As with all types of
+pointers, such permutation pointers can be predicted using a row-wise softmax on unconstrained
+decoder outputs (logits), trained with cross entropy (as in Veličković et al. [5]). However, this does
+not explicitly take advantage of the fact that the pointers encode a permutation, which the model
+has to learn instead. Our early experiments showed that the model was often failing to predict valid
+permutations OOD.
+Accordingly, we enforce a permutation inductive bias in the output decoder of sorting algorithms, as
+follows. First, we modify the output representation by rewiring the first node to point to the last one,
+turning P into a permutation matrix, i.e., a matrix whose rows and columns are one-hot vectors. We
+also augment the representation with a one-hot vector of size n that specifies the first node, so we do
+not lose this information; this vector is treated like a regular mask_one feature. Second, we predict the
+permutation matrix P from unconstrained decoder outputs Y by replacing the usual row-wise softmax
+with the Sinkhorn operator S [32, 50–53]. S projects an arbitrary square matrix Y into a doubly
+stochastic matrix S(Y) (a non-negative matrix whose rows and columns sum to 1), by exponentiating
+and repeatedly normalizing rows and columns so they sum to 1. Specifically, S is defined by:
+$$S^{0}(Y) = \exp(Y), \qquad S^{l}(Y) = T_c\left(T_r\left(S^{l-1}(Y)\right)\right), \qquad S(Y) = \lim_{l \to \infty} S^{l}(Y), \tag{2}$$
+where exp acts element-wise, and $T_r$ and $T_c$ denote row and column normalisation respectively.
+Although the Sinkhorn operator produces a doubly stochastic matrix rather than a permutation matrix,
+we can obtain a permutation matrix by introducing a temperature parameter, τ > 0, and taking
+$P = \lim_{\tau \to 0^{+}} S(Y/\tau)$; as long as there are no ties in the elements of Y, P is guaranteed to be a
+permutation matrix [52, Theorem 1].
+In practice, we compute the Sinkhorn operator using a fixed number of iterations $l_{\max}$. We use a
+smaller number of iterations $l_{\max} = 10$ for training, to limit vanishing and exploding gradients, and
+$l_{\max} = 60$ for evaluation. A fixed temperature τ = 0.1 was experimentally found to give a good
+balance between speed of convergence and tie-breaking. We also encode the fact that no node points
+to itself, that is, that all diagonal elements of P should be 0, by setting the diagonal elements of Y to
+−∞. To avoid ties, we follow Mena et al. [53], injecting Gumbel noise to the elements of Y prior to
+applying the Sinkhorn operator, during training only. Finally, we transform the predicted matrix P,
+and mask_one pointing to the first element, into the original pointer representation used by CLRS.
+3.2.3
+Processor networks
+Gating mechanisms. Many algorithms only require updating a few nodes at each time step, keeping
+the rest unchanged. However, the MPNN we use (Equation 1) is biased towards the opposite: it
+updates all hidden states in each step. Although it is theoretically possible for the network to keep the
+states unchanged, learning to do so is not easy. With this in mind, and motivated by its effectiveness
+in NDRs [54], we augment the network with an update gate, biased to be closed by default. We
+found that the gate stabilizes learning on many of the tasks, and increases the mean performance
+over all tasks on single-task training significantly. Surprisingly, however, we did not find gating to be
+advantageous in the multi-task case.
+To add gating to the MPNN model we produce a per-node gating vector from the same inputs that
+process the embeddings in Equation 1:
+$$g_i^{(t)} = f_g\left(z_i^{(t)}, m_i^{(t)}\right) \tag{3}$$
+where $f_g : \mathbb{R}^{2h} \times \mathbb{R}^{h} \to \mathbb{R}^{h}$ is the gating function, for which we use a two-layer MLP, with
+ReLU activation for the hidden layer and logistic sigmoid activation for the output. Importantly, the
+final layer bias of $f_g$ is initialized to a value of −3, which biases the network for not updating its
+6
+A Generalist Neural Algorithmic Learner
+Our model
+Previous SOTA [5]
+80
+60
+40
+Quickselect
+Heapsort
+Knuth-Morris-Pratt
+Strongly Conn. Comps.
+DFS
+Floyd-Warshall
+Quicksort
+Bubble Sort
+Optimal BST
+Find Max. Subarray
+Insertion Sort
+Binary Search
+LCS Length
+Naïve String Matcher
+MST Prim
+Topological Sort
+Task Scheduling
+MST Kruskal
+Articulation Points
+Jarvis' March
+Matrix Chain Order
+Bridges
+Graham Scan
+Dijkstra
+Activity Selector
+Bellman-Ford
+DAG Shortest Paths
+Segments Intersect
+0
+BFS
+20
+Minimum
+Average score [%]
+100
+Figure 2: The OOD performance in single-task experiments before and after the improvements
+presented in this paper, sorted in descending order of current performance. Error bars represent
+standard error of the mean across seeds (3 seeds for previous SOTA experiments, 10 seeds for current).
+The previous SOTA values are the best of MPNN, PGN and Memnet models (see Table 2).
+representations, unless necessary. The processed gated embeddings, $\hat{h}_i^{(t)}$, are computed as follows:
+$$\hat{h}_i^{(t)} = g_i^{(t)} \odot h_i^{(t)} + \left(1 - g_i^{(t)}\right) \odot h_i^{(t-1)} \tag{4}$$
+and are used instead of $h_i^{(t)}$ in the subsequent steps, replacing $z^{(t)}$ in Eq. 1 by $z^{(t)} = x_i^{(t)} \,\|\, \hat{h}_i^{(t-1)}$.
+Triplet reasoning. Several algorithms within CLRS-30 explicitly require edge-based reasoning—
+where edges store values, and update them based on other edges’ values. An example of this is the
+Floyd-Warshall algorithm [55], which computes all-pairs shortest paths in a weighted graph. The
+update rule for $d_{ij}$, its estimate for the best distance from node $i$ to $j$, is $d_{ij} = \min_k \left(d_{ik} + d_{kj}\right)$, which
+roughly says “the best way to get from i to j is to find the optimal mid-point k, travel from i to k, then
+from k to j”. Similar rules are pervasive across many CLRS-30 algorithms, especially in dynamic
+programming. Even though there are no node representations in the above update, all our processors
+are centered on passing messages between node representations hi .
+To rectify this situation, we augment our processor to perform message passing towards edges.
+Referring again to the update for dij , we note that the edge representations are updated by choosing
+an intermediate node, then aggregating over all possible choices. Accordingly, and as previously observed by Dudzik and Veličković [31], we introduce triplet reasoning: first, computing representations
+over triplets of nodes, then reducing over one node to obtain edge latents:
+$$t_{ijk} = \psi_t\left(h_i, h_j, h_k, e_{ij}, e_{ik}, e_{kj}, g\right), \qquad h_{ij} = \phi_t\left(\max_k t_{ijk}\right) \tag{5}$$
+Here, ψt is a triplet message function, mapping all relevant representations to a single vector for
+each triplet of nodes, and φt is an edge readout function, which transforms the aggregated triplets
+for each edge for later use. According to prior findings on the CLRS benchmark [5], we use the
+max aggregation to obtain edge representations. The computed hij vectors can then be used in any
+edge-based reasoning task, and empirically they are indeed significantly beneficial, even in tasks
+where we did not initially anticipate such benefits. One example is Kruskal’s minimum spanning tree
+algorithm [56], where we presume that access to triplet reasoning allowed the model to more easily
+sort the edges by weight, as it selects how to augment the spanning forest at each step.
+In order to keep the footprint of triplet embeddings as lightweight as possible, we compute only
+8-dimensional features in ψt . φt then upscales the aggregated edge features back to 128 dimensions,
+to make them compatible with the rest of the architecture. Our initial experimentation demonstrated
+that the output dimensionality of ψt did not significantly affect downstream performance. Note that
+computing triplet representations has been a useful approach in general GNN design [57]—however,
+it has predominantly been studied in the context of GNNs over constant input features. Our study is
+among the first to verify their utility over reasoning tasks with well-specified initial features.
+3.3
+Results
+By incorporating the changes described in the previous sections we arrived at a single model type,
+with a single set of hyper-parameters, that was trained to reach new state-of-the-art performance
+7
+A Generalist Neural Algorithmic Learner
+Table 1: Single-task OOD micro-F1 score of previous SOTA Memnet, MPNN and PGN [5] and our
+best model Triplet-GMPNN with all our improvements, after 10,000 training steps.
+| Alg. Type    | Memnet [5]     | MPNN [5]       | PGN [5]        | Triplet-GMPNN (ours) |
+|--------------|----------------|----------------|----------------|----------------------|
+| Div. & C.    | 13.05% ± 0.14  | 20.30% ± 0.85  | 65.23% ± 4.44  | 76.36% ± 1.34        |
+| DP           | 67.94% ± 8.20  | 65.10% ± 6.44  | 70.58% ± 6.48  | 81.99% ± 4.98        |
+| Geometry     | 45.14% ± 11.95 | 73.11% ± 17.19 | 61.19% ± 7.01  | 94.09% ± 2.30        |
+| Graphs       | 24.12% ± 5.30  | 62.79% ± 8.75  | 60.25% ± 8.42  | 81.41% ± 6.21        |
+| Greedy       | 53.42% ± 20.82 | 82.39% ± 3.01  | 75.84% ± 6.59  | 91.21% ± 2.95        |
+| Search       | 34.35% ± 21.67 | 41.20% ± 19.87 | 56.11% ± 21.56 | 58.61% ± 24.34       |
+| Sorting      | 71.53% ± 1.41  | 11.83% ± 2.78  | 15.45% ± 8.46  | 60.37% ± 12.16       |
+| Strings      | 1.51% ± 0.46   | 3.21% ± 0.94   | 2.04% ± 0.20   | 49.09% ± 23.49       |
+| Overall avg. | 38.88%         | 44.99%         | 50.84%         | 74.14%               |
+| > 90%        | 0/30           | 6/30           | 3/30           | 11/30                |
+| > 80%        | 3/30           | 9/30           | 7/30           | 17/30                |
+| > 60%        | 10/30          | 14/30          | 15/30          | 24/30                |
+on CLRS-30 [5]. Tables 1 and 2 show the micro-F1 scores of our model, which we refer to as
+Triplet-GMPNN (an MPNN with gating and triplet edge processing), over the original CLRS-30 test
+set (computed identically to Veličković et al. [5], but with 10 repetitions instead of 3). Our baselines
+include the Memnet [58], MPNN [35] and PGN [59] models, taken directly from Veličković et al. [5].
+Figure 2 displays the comparison between the improved model and the best model from Veličković
+et al. [5]. Our improvements lead to an overall average performance that is more than 20% higher
+(in absolute terms) compared to the next best model (see Table 1), and to a significant performance
+improvement in all but one algorithm family, compared to every other model. Further, our stabilising
+changes (such as gradient clipping) have empirically reduced the scale of our model’s gradient
+updates across the 30 tasks, preparing us better for the numerical issues of the multi-task regime. We
+finally also note that though we do not show it in Tables 1 & 2, applying the same improvements to
+the PGN processor, leads to an increase in overall performance from 50.84% (Table 1) to 69.31%.
+There are two notable examples of algorithm families with significant OOD performance improvement.
+The first are geometric algorithms (Segments Intersect, Graham Scan [60] and Jarvis’ March), now
+solved at approximately 94% OOD, compared to the previous best of about 73%; the second being
+string algorithms (Knuth-Morris-Pratt and Naïve String Matcher) for which our model now exceeds
+49% compared to the previous best of approximately 3%.
+The significant overall performance boost is reflected in the increased number of algorithms we can
+now solve at over 60%, 80% & 90% OOD performance, compared to previous SOTA [5]. Specifically,
+we now exceed 60% accuracy in 24 algorithms (15 algorithms previously), 80% for 17 algorithms (9
+previously) and 90% for 11 algorithms (6 previously).

benchmarks/CLRS/env/baselines.py ADDED Viewed

	@@ -0,0 +1,794 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""JAX implementation of CLRS baseline models."""
+import functools
+import os
+import pickle
+from typing import Dict, List, Optional, Tuple, Union
+import chex
+from clrs._src import decoders
+from clrs._src import losses
+from clrs._src import model
+from clrs._src import nets
+from clrs._src import probing
+from clrs._src import processors
+from clrs._src import samplers
+from clrs._src import specs
+import haiku as hk
+import jax
+import jax.numpy as jnp
+import numpy as np
+import optax
+_Array = chex.Array
+_DataPoint = probing.DataPoint
+_Features = samplers.Features
+_FeaturesChunked = samplers.FeaturesChunked
+_Feedback = samplers.Feedback
+_Location = specs.Location
+_Seed = jnp.integer
+_Spec = specs.Spec
+_Stage = specs.Stage
+_Trajectory = samplers.Trajectory
+_Type = specs.Type
+_OutputClass = specs.OutputClass
+# pytype: disable=signature-mismatch
+def _maybe_pick_first_pmapped(tree):
+  if jax.local_device_count() == 1:
+    return tree
+  return jax.tree_util.tree_map(lambda x: x[0], tree)
+@jax.jit
+def _restack_from_pmap(tree):
+  """Stack the results of a pmapped computation across the first two axes."""
+  restack_array = lambda x: jnp.reshape(x, (-1,) + x.shape[2:])
+  return jax.tree_util.tree_map(restack_array, tree)
+def _maybe_restack_from_pmap(tree):
+  if jax.local_device_count() == 1:
+    return tree
+  return _restack_from_pmap(tree)
+@functools.partial(jax.jit, static_argnums=[1, 2])
+def _pmap_reshape(x, n_devices, split_axis=0):
+  """Splits a pytree over n_devices on axis split_axis for pmapping."""
+  def _reshape(arr):
+    new_shape = (arr.shape[:split_axis] +
+                 (n_devices, arr.shape[split_axis] // n_devices) +
+                 arr.shape[split_axis + 1:])
+    return jnp.moveaxis(jnp.reshape(arr, new_shape), split_axis, 0)
+  return jax.tree_util.tree_map(_reshape, x)
+def _maybe_pmap_reshape(x, split_axis=0):
+  n_devices = jax.local_device_count()
+  if n_devices == 1:
+    return x
+  return _pmap_reshape(x, n_devices, split_axis)
+@functools.partial(jax.jit, static_argnums=1)
+def _pmap_data(data: Union[_Feedback, _Features], n_devices: int):
+  """Replicate/split feedback or features for pmapping."""
+  if isinstance(data, _Feedback):
+    features = data.features
+  else:
+    features = data
+  pmap_data = features._replace(
+      inputs=_pmap_reshape(features.inputs, n_devices),
+      hints=_pmap_reshape(features.hints, n_devices, split_axis=1),
+      lengths=_pmap_reshape(features.lengths, n_devices),
+  )
+  if isinstance(data, _Feedback):
+    pmap_data = data._replace(
+        features=pmap_data,
+        outputs=_pmap_reshape(data.outputs, n_devices)
+    )
+  return pmap_data
+def _maybe_pmap_data(data: Union[_Feedback, _Features]):
+  n_devices = jax.local_device_count()
+  if n_devices == 1:
+    return data
+  return _pmap_data(data, n_devices)
+def _maybe_put_replicated(tree):
+  if jax.local_device_count() == 1:
+    return jax.device_put(tree)
+  else:
+    return jax.device_put_replicated(tree, jax.local_devices())
+def _maybe_pmap_rng_key(rng_key: _Array):
+  n_devices = jax.local_device_count()
+  if n_devices == 1:
+    return rng_key
+  pmap_rng_keys = jax.random.split(rng_key, n_devices)
+  return jax.device_put_sharded(list(pmap_rng_keys), jax.local_devices())
+class BaselineModel(model.Model):
+  """Model implementation with selectable message passing algorithm."""
+  def __init__(
+      self,
+      spec: Union[_Spec, List[_Spec]],
+      dummy_trajectory: Union[List[_Feedback], _Feedback],
+      processor_factory: processors.ProcessorFactory,
+      hidden_dim: int = 32,
+      encode_hints: bool = False,
+      decode_hints: bool = True,
+      encoder_init: str = 'default',
+      use_lstm: bool = False,
+      learning_rate: float = 0.005,
+      grad_clip_max_norm: float = 0.0,
+      checkpoint_path: str = '/tmp/clrs3',
+      freeze_processor: bool = False,
+      dropout_prob: float = 0.0,
+      hint_teacher_forcing: float = 0.0,
+      hint_repred_mode: str = 'soft',
+      name: str = 'base_model',
+      nb_msg_passing_steps: int = 1,
+  ):
+    """Constructor for BaselineModel.
+    The model consists of encoders, processor and decoders. It can train
+    and evaluate either a single algorithm or a set of algorithms; in the
+    latter case, a single processor is shared among all the algorithms, while
+    the encoders and decoders are separate for each algorithm.
+    Args:
+      spec: Either a single spec for one algorithm, or a list of specs for
+        multiple algorithms to be trained and evaluated.
+      dummy_trajectory: Either a single feedback batch, in the single-algorithm
+        case, or a list of feedback batches, in the multi-algorithm case, that
+        comply with the `spec` (or list of specs), to initialize network size.
+      processor_factory: A callable that takes an `out_size` parameter
+        and returns a processor (see `processors.py`).
+      hidden_dim: Size of the hidden state of the model, i.e., size of the
+        message-passing vectors.
+      encode_hints: Whether to provide hints as model inputs.
+      decode_hints: Whether to provide hints as model outputs.
+      encoder_init: The initialiser type to use for the encoders.
+      use_lstm: Whether to insert an LSTM after message passing.
+      learning_rate: Learning rate for training.
+      grad_clip_max_norm: if greater than 0, the maximum norm of the gradients.
+      checkpoint_path: Path for loading/saving checkpoints.
+      freeze_processor: If True, the processor weights will be frozen and
+        only encoders and decoders (and, if used, the lstm) will be trained.
+      dropout_prob: Dropout rate in the message-passing stage.
+      hint_teacher_forcing: Probability of using ground-truth hints instead
+        of predicted hints as inputs during training (only relevant if
+        `encode_hints`=True)
+      hint_repred_mode: How to process predicted hints when fed back as inputs.
+        Only meaningful when `encode_hints` and `decode_hints` are True.
+        Options are:
+          - 'soft', where we use softmaxes for categoricals, pointers
+              and mask_one, and sigmoids for masks. This will allow gradients
+              to flow through hints during training.
+          - 'hard', where we use argmax instead of softmax, and hard
+              thresholding of masks. No gradients will go through the hints
+              during training; even for scalar hints, which don't have any
+              kind of post-processing, gradients will be stopped.
+          - 'hard_on_eval', which is soft for training and hard for evaluation.
+      name: Model name.
+      nb_msg_passing_steps: Number of message passing steps per hint.
+    Raises:
+      ValueError: if `encode_hints=True` and `decode_hints=False`.
+      AssertionError: if `hint_repred_mode` is not one of the options above,
+        or if a single feedback batch is given together with several specs.
+    """
+    super(BaselineModel, self).__init__(spec=spec)
+    if encode_hints and not decode_hints:
+      raise ValueError('`encode_hints=True`, `decode_hints=False` is invalid.')
+    assert hint_repred_mode in ['soft', 'hard', 'hard_on_eval']
+    self.decode_hints = decode_hints
+    self.checkpoint_path = checkpoint_path
+    self.name = name
+    self._freeze_processor = freeze_processor
+    # With clipping enabled, Adam is assembled by hand so that the global-norm
+    # clip runs before the Adam scaling; otherwise plain `optax.adam` is used.
+    if grad_clip_max_norm != 0.0:
+      optax_chain = [optax.clip_by_global_norm(grad_clip_max_norm),
+                     optax.scale_by_adam(),
+                     optax.scale(-learning_rate)]
+      self.opt = optax.chain(*optax_chain)
+    else:
+      self.opt = optax.adam(learning_rate)
+    self.nb_msg_passing_steps = nb_msg_passing_steps
+    # One dict per algorithm, mapping each input/hint/output name to the size
+    # of the last axis of its data.
+    self.nb_dims = []
+    if isinstance(dummy_trajectory, _Feedback):
+      assert len(self._spec) == 1
+      dummy_trajectory = [dummy_trajectory]
+    for traj in dummy_trajectory:
+      nb_dims = {}
+      for inp in traj.features.inputs:
+        nb_dims[inp.name] = inp.data.shape[-1]
+      for hint in traj.features.hints:
+        nb_dims[hint.name] = hint.data.shape[-1]
+      for outp in traj.outputs:
+        nb_dims[outp.name] = outp.data.shape[-1]
+      self.nb_dims.append(nb_dims)
+    self._create_net_fns(hidden_dim, encode_hints, processor_factory, use_lstm,
+                         encoder_init, dropout_prob, hint_teacher_forcing,
+                         hint_repred_mode)
+    # Device-side copies of params / optimizer state; filled in by `init` or
+    # through the `params` / `opt_state` setters.
+    self._device_params = None
+    self._device_opt_state = None
+    self.opt_state_skeleton = None
+  def _create_net_fns(self, hidden_dim, encode_hints, processor_factory,
+                      use_lstm, encoder_init, dropout_prob,
+                      hint_teacher_forcing, hint_repred_mode):
+    """Build the Haiku-transformed net and its jitted or pmapped entry points.
+    `jax.jit` is used with one local device and `jax.pmap` (over axis name
+    'batch') with several. The static / donated argument indices below are
+    positions in the bound methods' argument lists, i.e. not counting `self`.
+    """
+    def _use_net(*args, **kwargs):
+      return nets.Net(self._spec, hidden_dim, encode_hints, self.decode_hints,
+                      processor_factory, use_lstm, encoder_init,
+                      dropout_prob, hint_teacher_forcing,
+                      hint_repred_mode,
+                      self.nb_dims, self.nb_msg_passing_steps)(*args, **kwargs)
+    self.net_fn = hk.transform(_use_net)
+    pmap_args = dict(axis_name='batch', devices=jax.local_devices())
+    n_devices = jax.local_device_count()
+    func, static_arg, extra_args = (
+        (jax.jit, 'static_argnums', {}) if n_devices == 1 else
+        (jax.pmap, 'static_broadcasted_argnums', pmap_args))
+    pmean = functools.partial(jax.lax.pmean, axis_name='batch')
+    # Cross-device mean only exists under pmap; identity on a single device.
+    self._maybe_pmean = pmean if n_devices > 1 else lambda x: x
+    # _compute_grad: `algorithm_index` (position 3) is static.
+    extra_args[static_arg] = 3
+    self.jitted_grad = func(self._compute_grad, **extra_args)
+    # _feedback: `algorithm_index` (position 4) is static; `params` (0) and
+    # `opt_state` (3) buffers are donated.
+    extra_args[static_arg] = 4
+    self.jitted_feedback = func(self._feedback, donate_argnums=[0, 3],
+                                **extra_args)
+    # _predict: `algorithm_index`, `return_hints`, `return_all_outputs` static.
+    extra_args[static_arg] = [3, 4, 5]
+    self.jitted_predict = func(self._predict, **extra_args)
+    # accum_opt_update: `opt` and `freeze_processor` are static; `params` (0)
+    # and `opt_state` (2) buffers are donated.
+    extra_args[static_arg] = [3, 4]
+    self.jitted_accum_opt_update = func(accum_opt_update, donate_argnums=[0, 2],
+                                        **extra_args)
+  def init(self, features: Union[_Features, List[_Features]], seed: _Seed):
+    """Initialise parameters and optimizer state from `features` and `seed`.
+    A single `features` object is accepted only in the single-spec case and
+    is wrapped in a list.
+    """
+    if not isinstance(features, list):
+      assert len(self._spec) == 1
+      features = [features]
+    # NOTE(review): the positional `True` is presumably the net's `repred`
+    # argument (it is passed by keyword in `_predict`/`_loss`) - confirm in
+    # `nets.Net`.
+    self.params = self.net_fn.init(jax.random.PRNGKey(seed), features, True,  # pytype: disable=wrong-arg-types  # jax-ndarray
+                                   algorithm_index=-1,
+                                   return_hints=False,
+                                   return_all_outputs=False)
+    self.opt_state = self.opt.init(self.params)
+    # We will use the optimizer state skeleton for traversal when we
+    # want to avoid updating the state of params of untrained algorithms.
+    self.opt_state_skeleton = self.opt.init(jnp.zeros(1))
+  @property
+  def params(self):
+    """Host copy of the parameters (first replica when pmapped), or None."""
+    if self._device_params is None:
+      return None
+    return jax.device_get(_maybe_pick_first_pmapped(self._device_params))
+  @params.setter
+  def params(self, params):
+    # Stored on device, replicated across local devices when there are several.
+    self._device_params = _maybe_put_replicated(params)
+  @property
+  def opt_state(self):
+    """Host copy of the optimizer state (first replica when pmapped), or None."""
+    if self._device_opt_state is None:
+      return None
+    return jax.device_get(_maybe_pick_first_pmapped(self._device_opt_state))
+  @opt_state.setter
+  def opt_state(self, opt_state):
+    # Stored on device, replicated across local devices when there are several.
+    self._device_opt_state = _maybe_put_replicated(opt_state)
+  def _compute_grad(self, params, rng_key, feedback, algorithm_index):
+    """Return the loss and its gradients, averaged across devices under pmap."""
+    lss, grads = jax.value_and_grad(self._loss)(
+        params, rng_key, feedback, algorithm_index)
+    return self._maybe_pmean(lss), self._maybe_pmean(grads)
+  def _feedback(self, params, rng_key, feedback, opt_state, algorithm_index):
+    """Run one training step; return the loss, new params and new opt state."""
+    lss, grads = jax.value_and_grad(self._loss)(
+        params, rng_key, feedback, algorithm_index)
+    grads = self._maybe_pmean(grads)
+    params, opt_state = self._update_params(params, grads, opt_state,
+                                            algorithm_index)
+    lss = self._maybe_pmean(lss)
+    return lss, params, opt_state
+  def _predict(self, params, rng_key: hk.PRNGSequence, features: _Features,
+               algorithm_index: int, return_hints: bool,
+               return_all_outputs: bool):
+    """Apply the net with `repred=True` and postprocess its outputs.
+    Returns the postprocessed outputs and the hint predictions from the net.
+    """
+    outs, hint_preds = self.net_fn.apply(
+        params, rng_key, [features],
+        repred=True, algorithm_index=algorithm_index,
+        return_hints=return_hints,
+        return_all_outputs=return_all_outputs)
+    outs = decoders.postprocess(self._spec[algorithm_index],
+                                outs,
+                                sinkhorn_temperature=0.1,
+                                sinkhorn_steps=50,
+                                hard=True,
+                                )
+    return outs, hint_preds
+  def compute_grad(
+      self,
+      rng_key: hk.PRNGSequence,
+      feedback: _Feedback,
+      algorithm_index: Optional[int] = None,
+  ) -> Tuple[float, _Array]:
+    """Compute gradients.
+    Parameters are not updated. `algorithm_index` may be omitted only when
+    the model has a single spec, in which case it defaults to 0.
+    """
+    if algorithm_index is None:
+      assert len(self._spec) == 1
+      algorithm_index = 0
+    assert algorithm_index >= 0
+    # Calculate gradients.
+    rng_keys = _maybe_pmap_rng_key(rng_key)  # pytype: disable=wrong-arg-types  # numpy-scalars
+    feedback = _maybe_pmap_data(feedback)
+    loss, grads = self.jitted_grad(
+        self._device_params, rng_keys, feedback, algorithm_index)
+    # Loss and grads were already averaged across devices; keep one replica.
+    loss = _maybe_pick_first_pmapped(loss)
+    grads = _maybe_pick_first_pmapped(grads)
+    return  loss, grads
+  def feedback(self, rng_key: hk.PRNGSequence, feedback: _Feedback,
+               algorithm_index=None) -> float:
+    """Run one training step on `feedback` and return the loss.
+    Updates the device-side parameters and optimizer state in place.
+    `algorithm_index` may be omitted only when the model has a single spec.
+    """
+    if algorithm_index is None:
+      assert len(self._spec) == 1
+      algorithm_index = 0
+    # Calculate and apply gradients.
+    rng_keys = _maybe_pmap_rng_key(rng_key)  # pytype: disable=wrong-arg-types  # numpy-scalars
+    feedback = _maybe_pmap_data(feedback)
+    loss, self._device_params, self._device_opt_state = self.jitted_feedback(
+        self._device_params, rng_keys, feedback,
+        self._device_opt_state, algorithm_index)
+    loss = _maybe_pick_first_pmapped(loss)
+    return loss
+  def predict(self, rng_key: hk.PRNGSequence, features: _Features,
+              algorithm_index: Optional[int] = None,
+              return_hints: bool = False,
+              return_all_outputs: bool = False):
+    """Model inference step."""
+    if algorithm_index is None:
+      assert len(self._spec) == 1
+      algorithm_index = 0
+    rng_keys = _maybe_pmap_rng_key(rng_key)  # pytype: disable=wrong-arg-types  # numpy-scalars
+    features = _maybe_pmap_data(features)
+    # Under pmap the per-device results are merged back into one batch axis.
+    return _maybe_restack_from_pmap(
+        self.jitted_predict(
+            self._device_params, rng_keys, features,
+            algorithm_index,
+            return_hints,
+            return_all_outputs))
+  def _loss(self, params, rng_key, feedback, algorithm_index):
+    """Calculates model loss f(feedback; params)."""
+    output_preds, hint_preds = self.net_fn.apply(
+        params, rng_key, [feedback.features],
+        repred=False,
+        algorithm_index=algorithm_index,
+        return_hints=True,
+        return_all_outputs=False)
+    nb_nodes = _nb_nodes(feedback, is_chunked=False)
+    lengths = feedback.features.lengths
+    total_loss = 0.0
+    # Calculate output loss.
+    for truth in feedback.outputs:
+      total_loss += losses.output_loss(
+          truth=truth,
+          pred=output_preds[truth.name],
+          nb_nodes=nb_nodes,
+      )
+    # Optionally accumulate hint losses.
+    if self.decode_hints:
+      for truth in feedback.features.hints:
+        total_loss += losses.hint_loss(
+            truth=truth,
+            preds=[x[truth.name] for x in hint_preds],
+            lengths=lengths,
+            nb_nodes=nb_nodes,
+        )
+    return total_loss
+  def _update_params(self, params, grads, opt_state, algorithm_index):
+    """Turn `grads` into optimizer updates and apply them to `params`.
+    Updates come from `filter_null_grads`. When the processor is frozen, only
+    the non-processor parameters are updated and then merged back into
+    `params`. Returns the new parameters and the new optimizer state.
+    """
+    updates, opt_state = filter_null_grads(
+        grads, self.opt, opt_state, self.opt_state_skeleton, algorithm_index)
+    if self._freeze_processor:
+      params_subset = _filter_out_processor(params)
+      updates_subset = _filter_out_processor(updates)
+      # Sanity checks: something was filtered out, and something is left.
+      assert len(params) > len(params_subset)
+      assert params_subset
+      new_params = optax.apply_updates(params_subset, updates_subset)
+      new_params = hk.data_structures.merge(params, new_params)
+    else:
+      new_params = optax.apply_updates(params, updates)
+    return new_params, opt_state
+  def update_model_params_accum(self, grads) -> None:
+    """Apply one optimizer step using `grads` via `accum_opt_update`."""
+    grads = _maybe_put_replicated(grads)
+    self._device_params, self._device_opt_state = self.jitted_accum_opt_update(
+        self._device_params, grads, self._device_opt_state, self.opt,
+        self._freeze_processor)
+  def verbose_loss(self, feedback: _Feedback, extra_info) -> Dict[str, _Array]:
+    """Gets verbose loss information.
+    `extra_info` is used as the hint predictions. Only hint losses are
+    reported, and only when `decode_hints` is set; otherwise the result is an
+    empty dict.
+    """
+    hint_preds = extra_info
+    nb_nodes = _nb_nodes(feedback, is_chunked=False)
+    lengths = feedback.features.lengths
+    losses_ = {}
+    # Optionally accumulate hint losses.
+    if self.decode_hints:
+      for truth in feedback.features.hints:
+        losses_.update(
+            losses.hint_loss(
+                truth=truth,
+                preds=[x[truth.name] for x in hint_preds],
+                lengths=lengths,
+                nb_nodes=nb_nodes,
+                verbose=True,
+            ))
+    return losses_
+  def restore_model(self, file_name: str, only_load_processor: bool = False):
+    """Restore model from `file_name`.
+    The file is read from `checkpoint_path`. With `only_load_processor`, only
+    the processor parameters of the checkpoint are merged into the current
+    parameters. The optimizer state is replaced by the checkpointed one in
+    both cases.
+    """
+    path = os.path.join(self.checkpoint_path, file_name)
+    with open(path, 'rb') as f:
+      # NOTE(review): `pickle.load` can execute arbitrary code; only restore
+      # checkpoints that come from a trusted source.
+      restored_state = pickle.load(f)
+      if only_load_processor:
+        restored_params = _filter_in_processor(restored_state['params'])
+      else:
+        restored_params = restored_state['params']
+      self.params = hk.data_structures.merge(self.params, restored_params)
+      self.opt_state = restored_state['opt_state']
+  def save_model(self, file_name: str):
+    """Save all model parameters and the optimizer state to `file_name`.
+    The file is pickled into `checkpoint_path`, which is created if missing.
+    """
+    os.makedirs(self.checkpoint_path, exist_ok=True)
+    to_save = {'params': self.params, 'opt_state': self.opt_state}
+    path = os.path.join(self.checkpoint_path, file_name)
+    with open(path, 'wb') as f:
+      pickle.dump(to_save, f)
+class BaselineModelChunked(BaselineModel):
+  """Model that processes time-chunked data.
+    Unlike `BaselineModel`, which processes full samples, `BaselineModelChunked`
+    processes fixed-timelength chunks of data. Each tensor of inputs and hints
+    has dimensions chunk_length x batch_size x ... The beginning of a new
+    sample within the chunk is signalled by a tensor called `is_first` of
+    dimensions chunk_length x batch_size.
+    The chunked model is intended for training. For validation and test, use
+    `BaselineModel`.
+  """
+  # Message-passing states, indexed as [length_index][algorithm_index].
+  # `mp_states` holds the latest state returned by a training step;
+  # `init_mp_states` holds the states created in `init`.
+  mp_states: List[List[nets.MessagePassingStateChunked]]
+  init_mp_states: List[List[nets.MessagePassingStateChunked]]
+  def _create_net_fns(self, hidden_dim, encode_hints, processor_factory,
+                      use_lstm, encoder_init, dropout_prob,
+                      hint_teacher_forcing, hint_repred_mode):
+    """Build the transformed chunked net and its jitted or pmapped functions.
+    Mirrors `BaselineModel._create_net_fns`, with argument positions shifted
+    by the extra `mp_state` argument and without a jitted predict function.
+    """
+    def _use_net(*args, **kwargs):
+      return nets.NetChunked(
+          self._spec, hidden_dim, encode_hints, self.decode_hints,
+          processor_factory, use_lstm, encoder_init, dropout_prob,
+          hint_teacher_forcing, hint_repred_mode,
+          self.nb_dims, self.nb_msg_passing_steps)(*args, **kwargs)
+    self.net_fn = hk.transform(_use_net)
+    pmap_args = dict(axis_name='batch', devices=jax.local_devices())
+    n_devices = jax.local_device_count()
+    func, static_arg, extra_args = (
+        (jax.jit, 'static_argnums', {}) if n_devices == 1 else
+        (jax.pmap, 'static_broadcasted_argnums', pmap_args))
+    pmean = functools.partial(jax.lax.pmean, axis_name='batch')
+    self._maybe_pmean = pmean if n_devices > 1 else lambda x: x
+    # _compute_grad: `algorithm_index` (position 4) is static.
+    extra_args[static_arg] = 4
+    self.jitted_grad = func(self._compute_grad, **extra_args)
+    # _feedback: `algorithm_index` (position 5) is static; `params` (0) and
+    # `opt_state` (4) buffers are donated.
+    extra_args[static_arg] = 5
+    self.jitted_feedback = func(self._feedback, donate_argnums=[0, 4],
+                                **extra_args)
+    # accum_opt_update: `opt` and `freeze_processor` are static.
+    extra_args[static_arg] = [3, 4]
+    self.jitted_accum_opt_update = func(accum_opt_update, donate_argnums=[0, 2],
+                                        **extra_args)
+  def _init_mp_state(self, features_list: List[List[_FeaturesChunked]],
+                     rng_key: _Array):
+    """Create message-passing states for every entry of `features_list`.
+    For each inner list, the net is initialised and applied once with
+    `init_mp_state=True` on empty states; the second element of the apply
+    result is kept as the state. The params used here are throwaway.
+    """
+    def _empty_mp_state():
+      return nets.MessagePassingStateChunked(  # pytype: disable=wrong-arg-types  # numpy-scalars
+          inputs=None, hints=None, is_first=None,
+          hint_preds=None, hiddens=None, lstm_state=None)
+    empty_mp_states = [[_empty_mp_state() for _ in f] for f in features_list]
+    dummy_params = [self.net_fn.init(rng_key, f, e, False,
+                                     init_mp_state=True, algorithm_index=-1)
+                    for (f, e) in zip(features_list, empty_mp_states)]
+    mp_states = [
+        self.net_fn.apply(d, rng_key, f, e, False,
+                          init_mp_state=True, algorithm_index=-1)[1]
+        for (d, f, e) in zip(dummy_params, features_list, empty_mp_states)]
+    return mp_states
+  def init(self,
+           features: List[List[_FeaturesChunked]],
+           seed: _Seed):
+    """Initialise message-passing states, parameters and optimizer state.
+    Parameters are initialised from the first entry of `features` only.
+    """
+    self.mp_states = self._init_mp_state(features,
+                                         jax.random.PRNGKey(seed))  # pytype: disable=wrong-arg-types  # jax-ndarray
+    # Copy the inner lists so that later assignments into `mp_states` do not
+    # overwrite the initial states.
+    self.init_mp_states = [list(x) for x in self.mp_states]
+    self.params = self.net_fn.init(
+        jax.random.PRNGKey(seed), features[0], self.mp_states[0],  # pytype: disable=wrong-arg-types  # jax-ndarray
+        True, init_mp_state=False, algorithm_index=-1)
+    self.opt_state = self.opt.init(self.params)
+    # We will use the optimizer state skeleton for traversal when we
+    # want to avoid updating the state of params of untrained algorithms.
+    self.opt_state_skeleton = self.opt.init(jnp.zeros(1))
+  def predict(self, rng_key: hk.PRNGSequence, features: _FeaturesChunked,
+              algorithm_index: Optional[int] = None):
+    """Inference not implemented. Chunked model intended for training only."""
+    raise NotImplementedError
+  def _loss(self, params, rng_key, feedback, mp_state, algorithm_index):
+    """Calculate the chunked loss.
+    Returns the total loss and, as auxiliary output, a 1-tuple holding the
+    message-passing state returned by the net.
+    """
+    (output_preds, hint_preds), mp_state = self.net_fn.apply(
+        params, rng_key, [feedback.features],
+        [mp_state],
+        repred=False,
+        init_mp_state=False,
+        algorithm_index=algorithm_index)
+    nb_nodes = _nb_nodes(feedback, is_chunked=True)
+    total_loss = 0.0
+    is_first = feedback.features.is_first
+    is_last = feedback.features.is_last
+    # Calculate output loss.
+    for truth in feedback.outputs:
+      total_loss += losses.output_loss_chunked(
+          truth=truth,
+          pred=output_preds[truth.name],
+          is_last=is_last,
+          nb_nodes=nb_nodes,
+      )
+    # Optionally accumulate hint losses.
+    if self.decode_hints:
+      for truth in feedback.features.hints:
+        loss = losses.hint_loss_chunked(
+            truth=truth,
+            pred=hint_preds[truth.name],
+            is_first=is_first,
+            nb_nodes=nb_nodes,
+        )
+        total_loss += loss
+    return total_loss, (mp_state,)
+  def _compute_grad(self, params, rng_key, feedback, mp_state, algorithm_index):
+    """Return the loss, the new message-passing state and the gradients.
+    Loss and gradients are averaged across devices under pmap.
+    """
+    (lss, (mp_state,)), grads = jax.value_and_grad(self._loss, has_aux=True)(
+        params, rng_key, feedback, mp_state, algorithm_index)
+    return self._maybe_pmean(lss), mp_state, self._maybe_pmean(grads)
+  def _feedback(self, params, rng_key, feedback, mp_state, opt_state,
+                algorithm_index):
+    """Run one training step on a chunk.
+    Returns the loss, new params, new optimizer state and new message-passing
+    state.
+    """
+    (lss, (mp_state,)), grads = jax.value_and_grad(self._loss, has_aux=True)(
+        params, rng_key, feedback, mp_state, algorithm_index)
+    grads = self._maybe_pmean(grads)
+    params, opt_state = self._update_params(params, grads, opt_state,
+                                            algorithm_index)
+    lss = self._maybe_pmean(lss)
+    return lss, params, opt_state, mp_state
+  def compute_grad(
+      self,
+      rng_key: hk.PRNGSequence,
+      feedback: _Feedback,
+      algorithm_index: Optional[Tuple[int, int]] = None,
+  ) -> Tuple[float, _Array]:
+    """Compute gradients.
+    `algorithm_index` is a `(length_index, algorithm_index)` pair; it may be
+    omitted only with a single spec, in which case it defaults to `(0, 0)`.
+    The returned message-passing state is stored in `mp_states`.
+    """
+    if algorithm_index is None:
+      assert len(self._spec) == 1
+      algorithm_index = (0, 0)
+    length_index, algorithm_index = algorithm_index
+    # Reusing init_mp_state improves performance.
+    # The next, commented out line, should be used for proper state keeping.
+    # mp_state = self.mp_states[length_index][algorithm_index]
+    mp_state = self.init_mp_states[length_index][algorithm_index]
+    rng_keys = _maybe_pmap_rng_key(rng_key)  # pytype: disable=wrong-arg-types  # numpy-scalars
+    # Feedback is split across devices on axis 1, the state on axis 0.
+    feedback = _maybe_pmap_reshape(feedback, split_axis=1)
+    mp_state = _maybe_pmap_reshape(mp_state, split_axis=0)
+    loss, mp_state, grads = self.jitted_grad(
+        self._device_params, rng_keys, feedback, mp_state, algorithm_index)
+    loss = _maybe_pick_first_pmapped(loss)
+    grads = _maybe_pick_first_pmapped(grads)
+    mp_state = _maybe_restack_from_pmap(mp_state)
+    self.mp_states[length_index][algorithm_index] = mp_state
+    return loss, grads
+  def feedback(self, rng_key: hk.PRNGSequence, feedback: _Feedback,
+               algorithm_index=None) -> float:
+    """Run one training step on a chunk and return the loss.
+    `algorithm_index` is a `(length_index, algorithm_index)` pair; it may be
+    omitted only with a single spec. Updates the device-side parameters and
+    optimizer state, and stores the returned state in `mp_states`.
+    """
+    if algorithm_index is None:
+      assert len(self._spec) == 1
+      algorithm_index = (0, 0)
+    length_index, algorithm_index = algorithm_index
+    # Reusing init_mp_state improves performance.
+    # The next, commented out line, should be used for proper state keeping.
+    # mp_state = self.mp_states[length_index][algorithm_index]
+    mp_state = self.init_mp_states[length_index][algorithm_index]
+    rng_keys = _maybe_pmap_rng_key(rng_key)  # pytype: disable=wrong-arg-types  # numpy-scalars
+    # Feedback is split across devices on axis 1, the state on axis 0.
+    feedback = _maybe_pmap_reshape(feedback, split_axis=1)
+    mp_state = _maybe_pmap_reshape(mp_state, split_axis=0)
+    loss, self._device_params, self._device_opt_state, mp_state = (
+        self.jitted_feedback(
+            self._device_params, rng_keys, feedback,
+            mp_state, self._device_opt_state, algorithm_index))
+    loss = _maybe_pick_first_pmapped(loss)
+    mp_state = _maybe_restack_from_pmap(mp_state)
+    self.mp_states[length_index][algorithm_index] = mp_state
+    return loss
+  def verbose_loss(self, *args, **kwargs):
+    """Not implemented for the chunked model."""
+    raise NotImplementedError
+def _nb_nodes(feedback: _Feedback, is_chunked) -> int:
+  for inp in feedback.features.inputs:
+    if inp.location in [_Location.NODE, _Location.EDGE]:
+      if is_chunked:
+        return inp.data.shape[2]  # inputs are time x batch x nodes x ...
+      else:
+        return inp.data.shape[1]  # inputs are batch x nodes x ...
+  assert False
+def _param_in_processor(module_name):
+  return processors.PROCESSOR_TAG in module_name
+def _filter_out_processor(params: hk.Params) -> hk.Params:
+  return hk.data_structures.filter(
+      lambda module_name, n, v: not _param_in_processor(module_name), params)
+def _filter_in_processor(params: hk.Params) -> hk.Params:
+  return hk.data_structures.filter(
+      lambda module_name, n, v: _param_in_processor(module_name), params)
+def _is_not_done_broadcast(lengths, i, tensor):
+  is_not_done = (lengths > i + 1) * 1.0
+  while len(is_not_done.shape) < len(tensor.shape):
+    is_not_done = jnp.expand_dims(is_not_done, -1)
+  return is_not_done
+def accum_opt_update(params, grads, opt_state, opt, freeze_processor):
+  """Update params from gradients collected from several algorithms."""
+  # Average the gradients over all algos
+  grads = jax.tree_util.tree_map(
+      lambda *x: sum(x) / (sum([jnp.any(k) for k in x]) + 1e-12), *grads)
+  updates, opt_state = opt.update(grads, opt_state)
+  if freeze_processor:
+    params_subset = _filter_out_processor(params)
+    assert len(params) > len(params_subset)
+    assert params_subset
+    updates_subset = _filter_out_processor(updates)
+    new_params = optax.apply_updates(params_subset, updates_subset)
+    new_params = hk.data_structures.merge(params, new_params)
+  else:
+    new_params = optax.apply_updates(params, updates)
+  return new_params, opt_state
+@functools.partial(jax.jit, static_argnames=['opt'])
+def opt_update(opt, flat_grads, flat_opt_state):
+  return opt.update(flat_grads, flat_opt_state)
+def filter_null_grads(grads, opt, opt_state, opt_state_skeleton, algo_idx):
+  """Compute updates ignoring params that have no gradients.
+  This prevents untrained params (e.g., encoders/decoders for algorithms
+  that are not being trained) to accumulate, e.g., momentum from spurious
+  zero gradients.
+  Note: this works as intended for "per-parameter" optimizer state, such as
+    momentum. However, when the optimizer has some global state (such as the
+    step counts in Adam), the global state will be updated every time,
+    affecting also future updates of parameters that had null gradients in the
+    current step.
+  Args:
+    grads: Gradients for all parameters.
+    opt: Optax optimizer.
+    opt_state: Optimizer state.
+    opt_state_skeleton: A "skeleton" of optimizer state that has been
+      initialized with scalar parameters. This serves to traverse each parameter
+      of the optimizer state during the opt state update.
+    algo_idx: Index of algorithm, to filter out unused encoders/decoders.
+      If None, no filtering happens.
+  Returns:
+    Updates and new optimizer state, where the parameters with null gradient
+      have not been taken into account.
+  """
+  def _keep_in_algo(k, v):
+    """Ignore params of encoders/decoders irrelevant for this algo."""
+    # Note: in shared pointer decoder modes, we should exclude shared params
+    #       for algos that do not have pointer outputs.
+    if ((processors.PROCESSOR_TAG in k) or
+        (f'algo_{algo_idx}_' in k)):
+      return v
+    # Replace every leaf with None so that flattening drops these params.
+    return jax.tree_util.tree_map(lambda x: None, v)
+  if algo_idx is None:
+    masked_grads = grads
+  else:
+    masked_grads = {k: _keep_in_algo(k, v) for k, v in grads.items()}
+  flat_grads, treedef = jax.tree_util.tree_flatten(masked_grads)
+  # Walk the optimizer state along the skeleton's leaves: array entries are
+  # kept as they are, per-parameter trees are flattened like the masked
+  # gradients.
+  flat_opt_state = jax.tree_util.tree_map(
+      lambda _, x: x  # pylint:disable=g-long-lambda
+      if isinstance(x, (np.ndarray, jax.Array))
+      else treedef.flatten_up_to(x),
+      opt_state_skeleton,
+      opt_state,
+  )
+  # Compute updates only for the params with gradient.
+  flat_updates, flat_opt_state = opt_update(opt, flat_grads, flat_opt_state)
+  def unflatten(flat, original):
+    """Restore tree structure, filling missing (None) leaves with original."""
+    if isinstance(flat, (np.ndarray, jax.Array)):
+      return flat
+    return jax.tree_util.tree_map(lambda x, y: x if y is None else y, original,
+                                  treedef.unflatten(flat))
+  # Restore the state and updates tree structure.
+  new_opt_state = jax.tree_util.tree_map(lambda _, x, y: unflatten(x, y),
+                                         opt_state_skeleton, flat_opt_state,
+                                         opt_state)
+  # Params that were masked out receive a zero update.
+  updates = unflatten(flat_updates,
+                      jax.tree_util.tree_map(lambda x: 0., grads))
+  return updates, new_opt_state

benchmarks/CLRS/env/baselines_test.py ADDED Viewed

	@@ -0,0 +1,294 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `baselines.py`."""
+import copy
+import functools
+from typing import Generator
+from absl.testing import absltest
+from absl.testing import parameterized
+import chex
+from clrs._src import baselines
+from clrs._src import dataset
+from clrs._src import probing
+from clrs._src import processors
+from clrs._src import samplers
+from clrs._src import specs
+import haiku as hk
+import jax
+import numpy as np
+_Array = np.ndarray
+def _error(x, y):
+  return np.sum(np.abs(x-y))
+def _make_sampler(algo: str, length: int) -> samplers.Sampler:
+  sampler, _ = samplers.build_sampler(
+      algo,
+      seed=samplers.CLRS30['val']['seed'],
+      num_samples=samplers.CLRS30['val']['num_samples'],
+      length=length,
+  )
+  return sampler
+def _without_permutation(feedback):
+  """Replace should-be permutations with pointers."""
+  outputs = []
+  for x in feedback.outputs:
+    if x.type_ != specs.Type.SHOULD_BE_PERMUTATION:
+      outputs.append(x)
+      continue
+    assert x.location == specs.Location.NODE
+    outputs.append(probing.DataPoint(name=x.name, location=x.location,
+                                     type_=specs.Type.POINTER, data=x.data))
+  return feedback._replace(outputs=outputs)
+def _make_iterable_sampler(
+    algo: str, batch_size: int,
+    length: int) -> Generator[samplers.Feedback, None, None]:
+  sampler = _make_sampler(algo, length)
+  while True:
+    yield _without_permutation(sampler.next(batch_size))
+def _remove_permutation_from_spec(spec):
+  """Modify spec to turn permutation type to pointer.
+  Args:
+    spec: mapping from probe name to a 3-element entry; elements 1 and 2 are
+      compared against `specs.Location` and `specs.Type` values respectively.
+  Returns:
+    A new dict in which every entry located at NODE with type
+    SHOULD_BE_PERMUTATION has its type replaced by POINTER. All other entries
+    are reused as they are, and the input mapping is not modified.
+  """
+  new_spec = {}
+  for k in spec:
+    if (spec[k][1] == specs.Location.NODE and
+        spec[k][2] == specs.Type.SHOULD_BE_PERMUTATION):
+      # Keep elements 0 and 1 of the entry; only the type is swapped.
+      new_spec[k] = (spec[k][0], spec[k][1], specs.Type.POINTER)
+    else:
+      new_spec[k] = spec[k]
+  return new_spec
+class BaselinesTest(parameterized.TestCase):
+  """Tests for training steps of `BaselineModel` and `BaselineModelChunked`."""
+  def test_full_vs_chunked(self):
+    """Test that chunking does not affect gradients."""
+    batch_size = 4
+    length = 8
+    algo = 'insertion_sort'
+    spec = _remove_permutation_from_spec(specs.SPECS[algo])
+    rng_key = jax.random.PRNGKey(42)
+    # Three independent streams built with identical sampler arguments: full
+    # samples, chunks of `length` steps and chunks of `2 * length` steps.
+    full_ds = _make_iterable_sampler(algo, batch_size, length)
+    chunked_ds = dataset.chunkify(
+        _make_iterable_sampler(algo, batch_size, length),
+        length)
+    double_chunked_ds = dataset.chunkify(
+        _make_iterable_sampler(algo, batch_size, length),
+        length * 2)
+    full_batches = [next(full_ds) for _ in range(2)]
+    chunked_batches = [next(chunked_ds) for _ in range(2)]
+    double_chunk_batch = next(double_chunked_ds)
+    with chex.fake_jit():  # jitting makes test longer
+      processor_factory = processors.get_processor_factory(
+          'mpnn', use_ln=False, nb_triplet_fts=0)
+      common_args = dict(processor_factory=processor_factory, hidden_dim=8,
+                         learning_rate=0.01,
+                         decode_hints=True, encode_hints=True)
+      # Full-sample model: two training steps, each starting from the
+      # parameters captured right after `init`.
+      b_full = baselines.BaselineModel(
+          spec, dummy_trajectory=full_batches[0], **common_args)
+      b_full.init(full_batches[0].features, seed=42)  # pytype: disable=wrong-arg-types  # jax-ndarray
+      full_params = b_full.params
+      full_loss_0 = b_full.feedback(rng_key, full_batches[0])
+      b_full.params = full_params
+      full_loss_1 = b_full.feedback(rng_key, full_batches[1])
+      new_full_params = b_full.params
+      # Chunked model, initialised with the same seed as the full model.
+      b_chunked = baselines.BaselineModelChunked(
+          spec, dummy_trajectory=chunked_batches[0], **common_args)
+      b_chunked.init([[chunked_batches[0].features]], seed=42)  # pytype: disable=wrong-arg-types  # jax-ndarray
+      chunked_params = b_chunked.params
+      # Both models must start from exactly the same parameters.
+      jax.tree_util.tree_map(np.testing.assert_array_equal, full_params,
+                             chunked_params)
+      chunked_loss_0 = b_chunked.feedback(rng_key, chunked_batches[0])
+      b_chunked.params = chunked_params
+      chunked_loss_1 = b_chunked.feedback(rng_key, chunked_batches[1])
+      new_chunked_params = b_chunked.params
+      b_chunked.params = chunked_params
+      double_chunked_loss = b_chunked.feedback(rng_key, double_chunk_batch)
+    # Test that losses match
+    np.testing.assert_allclose(full_loss_0, chunked_loss_0, rtol=1e-4)
+    np.testing.assert_allclose(full_loss_1, chunked_loss_1, rtol=1e-4)
+    # The double-length chunk is compared against the sum of the two
+    # full-sample losses, i.e. its loss is expected to be their mean.
+    np.testing.assert_allclose(full_loss_0 + full_loss_1,
+                               2 * double_chunked_loss,
+                               rtol=1e-4)
+    # Test that gradients are the same (parameters changed equally).
+    # First check that gradients were not zero, i.e., parameters have changed.
+    param_change, _ = jax.tree_util.tree_flatten(
+        jax.tree_util.tree_map(_error, full_params, new_full_params))
+    self.assertGreater(np.mean(param_change), 0.1)
+    # Now check that full and chunked gradients are the same.
+    jax.tree_util.tree_map(
+        functools.partial(np.testing.assert_allclose, rtol=1e-4),
+        new_full_params, new_chunked_params)
+  def test_multi_vs_single(self):
+    """Test that multi = single when we only train one of the algorithms."""
+    batch_size = 4
+    length = 16
+    algos = ['insertion_sort', 'activity_selector', 'bfs']
+    spec = [_remove_permutation_from_spec(specs.SPECS[algo]) for algo in algos]
+    rng_key = jax.random.PRNGKey(42)
+    full_ds = [_make_iterable_sampler(algo, batch_size, length)
+               for algo in algos]
+    # Two consecutive batches per algorithm; only index 0 is trained below.
+    full_batches = [next(ds) for ds in full_ds]
+    full_batches_2 = [next(ds) for ds in full_ds]
+    with chex.fake_jit():  # jitting makes test longer
+      processor_factory = processors.get_processor_factory(
+          'mpnn', use_ln=False, nb_triplet_fts=0)
+      common_args = dict(processor_factory=processor_factory, hidden_dim=8,
+                         learning_rate=0.01,
+                         decode_hints=True, encode_hints=True)
+      # Single-algorithm model gets one spec; multi-algorithm model gets the
+      # whole list of specs and dummy trajectories.
+      b_single = baselines.BaselineModel(
+          spec[0], dummy_trajectory=full_batches[0], **common_args)
+      b_multi = baselines.BaselineModel(
+          spec, dummy_trajectory=full_batches, **common_args)
+      b_single.init(full_batches[0].features, seed=0)  # pytype: disable=wrong-arg-types  # jax-ndarray
+      b_multi.init([f.features for f in full_batches], seed=0)  # pytype: disable=wrong-arg-types  # jax-ndarray
+      single_params = []
+      single_losses = []
+      multi_params = []
+      multi_losses = []
+      # Snapshot parameters before, between and after the two training steps.
+      single_params.append(copy.deepcopy(b_single.params))
+      single_losses.append(b_single.feedback(rng_key, full_batches[0]))
+      single_params.append(copy.deepcopy(b_single.params))
+      single_losses.append(b_single.feedback(rng_key, full_batches_2[0]))
+      single_params.append(copy.deepcopy(b_single.params))
+      multi_params.append(copy.deepcopy(b_multi.params))
+      multi_losses.append(b_multi.feedback(rng_key, full_batches[0],
+                                           algorithm_index=0))
+      multi_params.append(copy.deepcopy(b_multi.params))
+      multi_losses.append(b_multi.feedback(rng_key, full_batches_2[0],
+                                           algorithm_index=0))
+      multi_params.append(copy.deepcopy(b_multi.params))
+    # Test that losses match
+    np.testing.assert_array_equal(single_losses, multi_losses)
+    # Test that loss decreased
+    assert single_losses[1] < single_losses[0]
+    # Test that param changes were the same in single and multi-algorithm
+    for single, multi in zip(single_params, multi_params):
+      assert hk.data_structures.is_subset(subset=single, superset=multi)
+      for module_name, params in single.items():
+        jax.tree_util.tree_map(np.testing.assert_array_equal, params,
+                               multi[module_name])
+    # Test that params change for the trained algorithm, but not the others
+    for module_name, params in multi_params[0].items():
+      param_changes = jax.tree_util.tree_map(lambda a, b: np.sum(np.abs(a - b)),
+                                             params,
+                                             multi_params[1][module_name])
+      # Total absolute change over all parameters of this module.
+      param_change = sum(param_changes.values())
+      if module_name in single_params[0]:  # params of trained algorithm
+        assert param_change > 1e-3
+      else:  # params of non-trained algorithms
+        assert param_change == 0.0
+  @parameterized.parameters(True, False)
+  def test_multi_algorithm_idx(self, is_chunked):
+    """Test that algorithm selection works as intended."""
+    batch_size = 4
+    length = 8
+    algos = ['insertion_sort', 'activity_selector', 'bfs']
+    spec = [_remove_permutation_from_spec(specs.SPECS[algo]) for algo in algos]
+    rng_key = jax.random.PRNGKey(42)
+    if is_chunked:
+      ds = [dataset.chunkify(_make_iterable_sampler(algo, batch_size, length),
+                             2 * length) for algo in algos]
+    else:
+      ds = [_make_iterable_sampler(algo, batch_size, length) for algo in algos]
+    batches = [next(d) for d in ds]
+    processor_factory = processors.get_processor_factory(
+        'mpnn', use_ln=False, nb_triplet_fts=0)
+    common_args = dict(processor_factory=processor_factory, hidden_dim=8,
+                       learning_rate=0.01,
+                       decode_hints=True, encode_hints=True)
+    if is_chunked:
+      baseline = baselines.BaselineModelChunked(
+          spec, dummy_trajectory=batches, **common_args)
+      # The chunked model is initialised with one extra level of list nesting.
+      baseline.init([[f.features for f in batches]], seed=0)  # pytype: disable=wrong-arg-types  # jax-ndarray
+    else:
+      baseline = baselines.BaselineModel(
+          spec, dummy_trajectory=batches, **common_args)
+      baseline.init([f.features for f in batches], seed=0)  # pytype: disable=wrong-arg-types  # jax-ndarray
+    # Find out what parameters change when we train each algorithm
+    def _change(x, y):
+      """Map each module name to its total absolute parameter change."""
+      changes = {}
+      for module_name, params in x.items():
+        changes[module_name] = sum(
+            jax.tree_util.tree_map(
+                lambda a, b: np.sum(np.abs(a-b)), params, y[module_name]
+                ).values())
+      return changes
+    param_changes = []
+    for algo_idx in range(len(algos)):
+      init_params = copy.deepcopy(baseline.params)
+      # The chunked model takes a pair as `algorithm_index`, the other an int.
+      _ = baseline.feedback(
+          rng_key,
+          batches[algo_idx],
+          algorithm_index=(0, algo_idx) if is_chunked else algo_idx)
+      param_changes.append(_change(init_params, baseline.params))
+    # Test that non-changing parameters correspond to encoders/decoders
+    # associated with the non-trained algorithms
+    unchanged = [[k for k in pc if pc[k] == 0] for pc in param_changes]
+    def _get_other_algos(algo_idx, modules):
+      """Return encoder/decoder module names not tagged with `algo_idx`."""
+      return set([k for k in modules if '_construct_encoders_decoders' in k
+                  and f'algo_{algo_idx}' not in k])
+    for algo_idx in range(len(algos)):
+      expected_unchanged = _get_other_algos(algo_idx, baseline.params.keys())
+      self.assertNotEmpty(expected_unchanged)
+      self.assertSetEqual(expected_unchanged, set(unchanged[algo_idx]))
+# Run the test cases through absltest when this file is executed directly.
+if __name__ == '__main__':
+  absltest.main()

benchmarks/CLRS/env/data_description.txt ADDED Viewed

	@@ -0,0 +1,35 @@

+The CLRS Algorithmic Reasoning Benchmark
+Learning representations of algorithms is an emerging area of machine learning, seeking to bridge concepts from neural networks with classical algorithms. The CLRS Algorithmic Reasoning Benchmark (CLRS) consolidates and extends previous work toward evaluating algorithmic reasoning by providing a suite of implementations of classical algorithms. These algorithms have been selected from the third edition of the standard Introduction to Algorithms by Cormen, Leiserson, Rivest and Stein.
+Algorithms as graphs
+CLRS implements the selected algorithms in an idiomatic way, which aligns as closely as possible to the original CLRS 3ed pseudocode. By controlling the input data distribution to conform to the preconditions we are able to automatically generate input/output pairs. We additionally provide trajectories of "hints" that expose the internal state of each algorithm, to both optionally simplify the learning challenge and to distinguish between different algorithms that solve the same overall task (e.g. sorting).
+In the most generic sense, algorithms can be seen as manipulating sets of objects, along with any relations between them (which can themselves be decomposed into binary relations). Accordingly, we study all of the algorithms in this benchmark using a graph representation. In the event that objects obey a more strict ordered structure (e.g. arrays or rooted trees), we impose this ordering through inclusion of predecessor links.
+How it works
+For each algorithm, we provide a canonical set of train, eval and test trajectories for benchmarking out-of-distribution generalization.
+Split	Trajectories	Problem Size
+Train	1000	16
+Eval	32 x multiplier	16
+Test	32 x multiplier	64
+Here, "problem size" refers to e.g. the length of an array or number of nodes in a graph, depending on the algorithm. "multiplier" is an algorithm-specific factor that increases the number of available eval and test trajectories to compensate for paucity of evaluation signals. "multiplier" is 1 for all algorithms except:
+Maximum subarray (Kadane), for which "multiplier" is 32.
+Quick select, minimum, binary search, string matchers (both naive and KMP), and segment intersection, for which "multiplier" is 64.
+The trajectories can be used like so:
+train_ds, num_samples, spec = clrs.create_dataset(
+      folder='/tmp/CLRS30', algorithm='bfs',
+      split='train', batch_size=32)
+for i, feedback in enumerate(train_ds.as_numpy_iterator()):
+  if i == 0:
+    model.init(feedback.features, initial_seed)
+  loss = model.feedback(rng_key, feedback)
+Here, feedback is a namedtuple with the following structure:
+Feedback = collections.namedtuple('Feedback', ['features', 'outputs'])
+Features = collections.namedtuple('Features', ['inputs', 'hints', 'lengths'])
+where the content of Features can be used for training and outputs is reserved for evaluation. Each field of the tuple is an ndarray with a leading batch dimension. Because hints are provided for the full algorithm trajectory, these contain an additional time dimension padded up to the maximum length max(T) of any trajectory within the dataset. The lengths field specifies the true length t <= max(T) for each trajectory, which can be used e.g. for loss masking.

benchmarks/CLRS/env/dataset.py ADDED Viewed

	@@ -0,0 +1,326 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CLRS dataset."""
+import dataclasses
+import functools
+from typing import Iterator
+from clrs._src import probing
+from clrs._src import samplers
+from clrs._src import specs
+import jax
+import numpy as np
+import tensorflow as tf
+import tensorflow_datasets as tfds
+def _correct_axis_filtering(tensor, index, name):
+  """Select entry `index` of `tensor` along the axis implied by `name`.
+  Args:
+    tensor: array to index.
+    index: position to select.
+    name: key of the tensor; if it contains 'hint_' the second axis is
+      indexed, otherwise the first.
+  Returns:
+    `tensor[:, index]` for hint tensors and `tensor[index]` for all others.
+  """
+  # Presumably hints are stored time-major with the sample axis second --
+  # TODO confirm against the sampler output.
+  if 'hint_' in name:
+    return tensor[:, index]
+  else:
+    return tensor[index]
+@dataclasses.dataclass
+class CLRSConfig(tfds.core.BuilderConfig):
+  """Specify the split in the variant because they have different shapes."""
+  # Split this config refers to; `_build_default_builder_configs` sets it to
+  # 'train', 'val' or 'test'.
+  split: str = ''
+# Populated at import time with one `CLRSConfig` per (split, algorithm) pair.
+DEFAULT_BUILDER_CONFIGS = []
+def _build_default_builder_configs():
+  """Append a config named `<alg>_<split>` for every split and algorithm."""
+  for split in ['train', 'val', 'test']:
+    for alg in specs.CLRS_30_ALGS:
+      DEFAULT_BUILDER_CONFIGS.append(
+          CLRSConfig(name=f'{alg}_{split}', split=split))
+_build_default_builder_configs()
+class CLRSDataset(tfds.core.GeneratorBasedBuilder):
+  """TFDS builder that generates CLRS samples with `samplers.build_sampler`.
+  Builder configs are named `<algorithm>_<split>`. The data for a config is
+  sampled in memory by `_create_data` and emitted one example at a time by
+  `_generate_examples`.
+  """
+  VERSION = tfds.core.Version('1.0.0')
+  RELEASE_NOTES = {
+      '1.0.0': 'Initial release.',
+  }
+  BUILDER_CONFIGS = DEFAULT_BUILDER_CONFIGS
+  # Dict of arrays produced by the last `_create_data` call, followed by the
+  # config name and split that `_split_generators` recorded for that data.
+  _instantiated_dataset = None
+  _instantiated_dataset_name = ''
+  _instantiated_dataset_split = ''
+  def _num_samples(self, algorithm_name):
+    """Return the number of samples for `algorithm_name` in this split.
+    The base count comes from `samplers.CLRS30`; for every split other than
+    'train' it is multiplied by the algorithm's 'num_samples_multiplier'.
+    """
+    num_samples = samplers.CLRS30[self._builder_config.split]['num_samples']  # pytype: disable=attribute-error  # always-use-return-annotations
+    if self._builder_config.split != 'train':  # pytype: disable=attribute-error  # always-use-return-annotations
+      # Generate more samples for those algorithms in which the number of
+      # signals is small.
+      num_samples *= specs.CLRS_30_ALGS_SETTINGS[algorithm_name][
+          'num_samples_multiplier']
+    return num_samples
+  def _create_data(self, single_sample):
+    """Sample the data for the current config into `_instantiated_dataset`.
+    Args:
+      single_sample: if true, request a batch of size 1 from the sampler;
+        otherwise `batch_size=None` is passed.
+    """
+    # The config name is `<algorithm>_<split>`; drop the trailing split token.
+    algorithm_name = '_'.join(self._builder_config.name.split('_')[:-1])
+    num_samples = self._num_samples(algorithm_name)
+    sampler, _ = samplers.build_sampler(
+        algorithm_name,
+        seed=samplers.CLRS30[self._builder_config.split]['seed'],  # pytype: disable=attribute-error  # always-use-return-annotations
+        num_samples=num_samples,
+        length=samplers.CLRS30[self._builder_config.split]['length'],  # pytype: disable=attribute-error  # always-use-return-annotations
+    )
+    sampled_dataset = sampler.next(batch_size=1 if single_sample else None)
+    data = {'input_' + t.name: t.data for t in sampled_dataset.features.inputs}
+    # All other data points have input_, hint_, and output_ prefixes, so we
+    # guarantee that this key is unused.
+    data['lengths'] = sampled_dataset.features.lengths
+    data.update({'output_' + t.name: t.data for t in sampled_dataset.outputs})
+    data.update({
+        'hint_' + t.name: t.data for t in sampled_dataset.features.hints})
+    self._instantiated_dataset = data
+  def _info(self) -> tfds.core.DatasetInfo:
+    """Return dataset info, read from disk or derived from one sample."""
+    if tf.io.gfile.exists(self.data_dir):
+      info = tfds.core.DatasetInfo(builder=self)
+      info.read_from_directory(self.data_dir)
+      return info
+    # NOTE(review): unlike `_split_generators`, this branch does not record
+    # the name/split markers after sampling, so the single-sample data is not
+    # treated as cached -- looks intentional, confirm.
+    if (self._instantiated_dataset_name != self._builder_config.name
+        or self._instantiated_dataset_split != self._builder_config.split):  # pytype: disable=attribute-error  # always-use-return-annotations
+      self._create_data(single_sample=True)
+    # Take entry 0 of every tensor to obtain per-example shapes and dtypes.
+    data = {k: _correct_axis_filtering(v, 0, k)
+            for k, v in self._instantiated_dataset.items()}
+    data_info = {
+        k: tfds.features.Tensor(shape=v.shape, dtype=tf.dtypes.as_dtype(
+            v.dtype)) for k, v in data.items()}
+    return tfds.core.DatasetInfo(
+        builder=self,
+        features=tfds.features.FeaturesDict(data_info),
+    )
+  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+    """Sample the data if needed and define the single split of this config.
+    `dl_manager` is not used; nothing is downloaded here.
+    """
+    if (self._instantiated_dataset_name != self._builder_config.name
+        or self._instantiated_dataset_split != self._builder_config.split):  # pytype: disable=attribute-error  # always-use-return-annotations
+      self._create_data(single_sample=False)
+      # Remember which config the in-memory data belongs to.
+      self._instantiated_dataset_name = self._builder_config.name
+      self._instantiated_dataset_split = self._builder_config.split  # pytype: disable=attribute-error  # always-use-return-annotations
+    return {self._builder_config.split: self._generate_examples()}  # pytype: disable=attribute-error  # always-use-return-annotations
+  def _generate_examples(self):
+    """Generator of examples for each split.
+    Yields:
+      `(key, example)` pairs, where `key` is the sample index as a string and
+      `example` holds entry `i` of every tensor in `_instantiated_dataset`.
+    """
+    algorithm_name = '_'.join(self._builder_config.name.split('_')[:-1])
+    for i in range(self._num_samples(algorithm_name)):
+      data = {k: _correct_axis_filtering(v, i, k)
+              for k, v in self._instantiated_dataset.items()}
+      yield str(i), data
+def _get_clrs_file_name():
+  """Return the archive file name, which embeds `CLRSDataset.VERSION`."""
+  return f'CLRS30_v{CLRSDataset.VERSION}.tar.gz'
+def get_dataset_gcp_url():
+  """Return the storage.googleapis.com URL of the versioned archive."""
+  return f'https://storage.googleapis.com/dm-clrs/{_get_clrs_file_name()}'
+def get_clrs_folder():
+  """Return the versioned folder name, `CLRS30_v<VERSION>`."""
+  return f'CLRS30_v{CLRSDataset.VERSION}'
+def _preprocess(data_point, algorithm=None):
+  """Convert sampled inputs into DataPoints.
+  Args:
+    data_point: mapping from key to tensor. Besides 'lengths', every key is
+      expected to look like `<stage>_<name>`.
+    algorithm: key into `specs.SPECS` used to look up each probe's spec.
+  Returns:
+    A `samplers.Feedback` whose features hold the input and hint data points
+    plus `lengths`, and whose outputs hold the output data points.
+  """
+  # NOTE(review): `specs.SPECS[algorithm]` is indexed unconditionally, so the
+  # `None` default does not look usable -- callers are expected to pass it.
+  inputs = []
+  outputs = []
+  hints = []
+  lengths = None
+  for name, data in data_point.items():
+    if name == 'lengths':
+      lengths = data
+      continue
+    # Split the stage prefix from the probe name.
+    data_point_name = name.split('_')
+    name = '_'.join(data_point_name[1:])
+    (stage, location, dp_type) = specs.SPECS[algorithm][name]
+    assert stage == data_point_name[0]
+    if stage == specs.Stage.HINT:
+      # Hints get their first two axes swapped.
+      data = tf.experimental.numpy.swapaxes(data, 0, 1)
+    dp = probing.DataPoint(name, location, dp_type, data)
+    if stage == specs.Stage.INPUT:
+      inputs.append(dp)
+    elif stage == specs.Stage.OUTPUT:
+      outputs.append(dp)
+    else:
+      hints.append(dp)
+  return samplers.Feedback(
+      samplers.Features(tuple(inputs), tuple(hints), lengths), tuple(outputs))
+def create_dataset(folder, algorithm, split, batch_size):
+  """Load a CLRS split as a repeated, batched and preprocessed dataset.
+  Args:
+    folder: directory passed to `tfds.load` as `data_dir`.
+    algorithm: algorithm name; also the key used to look up the spec.
+    split: split name, used both in the dataset name and as `split`.
+    batch_size: batch size applied after repeating the dataset.
+  Returns:
+    A tuple `(dataset, num_samples, spec)`: the dataset mapped through
+    `_preprocess`, the size of the loaded dataset before repeating, and
+    `specs.SPECS[algorithm]`.
+  """
+  dataset = tfds.load(f'clrs_dataset/{algorithm}_{split}',
+                      data_dir=folder, split=split)
+  num_samples = len(dataset)  # Must be done here for correct size
+  dataset = dataset.repeat()
+  dataset = dataset.batch(batch_size)
+  return (dataset.map(lambda d: _preprocess(d, algorithm=algorithm)),
+          num_samples,
+          specs.SPECS[algorithm])
+def _copy_hint(source, dest, i, start_source, start_dest, to_add):
+  """Copy from full-sample hint to a hint chunk.
+  Args:
+    source: full-sample hint array, indexed as [time, batch element, ...].
+    dest: chunk array, indexed the same way; written in place.
+    i: index of the batch element to copy.
+    start_source: first time step to read from `source`.
+    start_dest: first time step to write in `dest`.
+    to_add: number of time steps to copy.
+  Returns:
+    `dest`, after the copy.
+  """
+  # The destination region from `start_dest` on, for batch elements `i` and
+  # later, must still be empty (all zeros).
+  assert np.all(dest[start_dest:, i:] == 0)
+  assert start_dest < dest.shape[0]
+  assert start_dest + to_add <= dest.shape[0]
+  assert start_source < source.shape[0]
+  assert start_source + to_add <= source.shape[0]
+  dest[start_dest:start_dest+to_add, i] = source[
+      start_source:start_source+to_add, i]
+  return dest
+def _copy_io(source, dest, i, start_dest, to_add):
+  """Copy from an input or output to an input or output chunk.
+  Args:
+    source: full-sample array whose first axis indexes batch elements.
+    dest: chunk array indexed as [time, batch element, ...]; written in place.
+    i: index of the batch element to copy.
+    start_dest: first time step to write in `dest`.
+    to_add: number of time steps over which `source[i]` is repeated.
+  Returns:
+    `dest`, after the copy.
+  """
+  # The destination region from `start_dest` on, for batch elements `i` and
+  # later, must still be empty (all zeros).
+  assert np.all(dest[start_dest:, i:] == 0)
+  dest[start_dest:start_dest+to_add, i] = source[i]
+  return dest
+def chunkify(dataset: Iterator[samplers.Feedback], chunk_length: int):
+  """Generator of fixed-length chunks from full-trajectory samples.
+  Args:
+    dataset: full-sample dataset as numpy iterator.
+    chunk_length: time length of chunks.
+  Yields:
+    Fixed-timelength chunks of data. Each tensor of inputs, hints and outputs
+    has dimensions chunk_length x batch_size x ... Samples are not time-padded,
+    after the end of one sample immediately comes the next. Since different
+    samples can have different time lengths, the beginnings and ends of samples
+    within a batch do not need to coincide. For this reason, the chunked
+    dataset features include two chunk_length x batch_size int tensors,
+    `is_first` and `is_last`, that mark the beginning and end of each sample.
+    For example, if `chunk_length`==6 and `batch_size`==2 and the first
+    full-sample batch had one sample of length 3 and one of length 5,
+    we would have a first chunked batch with the following `is_first` and
+    `is_last` tensors:
+    is_first = [[1, 1]    is_last = [[0, 0]     ( sample id [[0 1]
+                [0, 0]               [0, 0]                  [0 1]
+                [0, 0]               [1, 0]                  [0 1]
+                [1, 0]               [0, 0]                  [2 1]
+                [0, 0]               [0, 1]                  [2 1]
+                [0, 1]]              [0, 0]]                 [2 3]] )
+    while the data in the inputs, outputs and hints tensors would correspond
+    to samples as identified by the sample_id indicated above for reference.
+    Notice that, while in the full-sample dataset inputs and outputs have
+    no time dimension, here they do; the input and output tensors are simply
+    repeated along each sample's time length.
+  """
+  def _get_batch():
+    """Pull the next full-sample batch and unpack it, with int lengths."""
+    d = next(dataset)
+    return (d.features.inputs, d.features.hints, d.outputs,
+            d.features.lengths.astype(int))
+  inputs, hints, outputs, lengths = _get_batch()
+  # The batch size is read from the first NODE- or EDGE-located input.
+  # NOTE(review): if no input has such a location, `batch_size` is never
+  # bound and its first use below fails -- confirm every algorithm has one.
+  for inp in inputs:
+    if inp.location in [specs.Location.NODE, specs.Location.EDGE]:
+      batch_size = inp.data.shape[0]
+      break
+  # Inputs/outputs gain a leading time axis; hints have their existing
+  # leading axis replaced by one of size `chunk_length`.
+  io_chunk = lambda x: np.zeros((chunk_length,) + x.shape, dtype=x.dtype)
+  chunk_inputs = jax.tree_util.tree_map(io_chunk, inputs)
+  chunk_outputs = jax.tree_util.tree_map(io_chunk, outputs)
+  hint_chunk = lambda x: np.zeros((chunk_length,) + x.shape[1:], dtype=x.dtype)
+  chunk_hints = jax.tree_util.tree_map(hint_chunk, hints)
+  # Queues of buffered full-sample batches. `lengths` keeps each batch's
+  # original per-sample lengths, `left` the time steps not yet consumed.
+  inputs = [inputs]
+  hints = [hints]
+  outputs = [outputs]
+  left = [lengths.copy()]
+  lengths = [lengths.copy()]
+  while True:
+    # Create a new empty chunk
+    chunk_inputs = jax.tree_util.tree_map(np.zeros_like, chunk_inputs)
+    chunk_hints = jax.tree_util.tree_map(np.zeros_like, chunk_hints)
+    chunk_outputs = jax.tree_util.tree_map(np.zeros_like, chunk_outputs)
+    start_mark = np.zeros((chunk_length, batch_size), dtype=int)
+    end_mark = np.zeros((chunk_length, batch_size), dtype=int)
+    # Get enough data batches to fill the new chunk
+    while np.any(np.sum(left, axis=0) < chunk_length):
+      inp, hh, out, ll = _get_batch()
+      inputs.append(inp)
+      hints.append(hh)
+      outputs.append(out)
+      left.append(ll.copy())
+      lengths.append(ll.copy())
+    # Fill the chunk, one batch element at a time
+    for i in range(batch_size):
+      # `total` counts time steps already written for element `i`; `idx`
+      # walks through the buffered batches in order.
+      total, idx = 0, 0
+      while total < chunk_length:
+        to_add = min(left[idx][i], chunk_length - total)
+        if to_add:
+          # Offset inside the sample at which copying resumes.
+          start = lengths[idx][i] - left[idx][i]
+          assert start >= 0
+          f_io = functools.partial(_copy_io, i=i, start_dest=total,
+                                   to_add=to_add)
+          chunk_inputs = jax.tree_util.tree_map(f_io, inputs[idx], chunk_inputs)
+          chunk_outputs = jax.tree_util.tree_map(f_io, outputs[idx],
+                                                 chunk_outputs)
+          f_hint = functools.partial(_copy_hint, i=i, start_source=start,
+                                     start_dest=total, to_add=to_add)
+          chunk_hints = jax.tree_util.tree_map(f_hint, hints[idx], chunk_hints)
+          # A sample copied from its very beginning starts at this step.
+          if start == 0:
+            start_mark[total, i] = 1
+          total += to_add
+          left[idx][i] -= to_add
+          assert left[idx][i] >= 0
+          # A fully consumed sample ends on the last step just written.
+          if left[idx][i] == 0:
+            end_mark[total - 1, i] = 1
+        idx += 1
+      assert total == chunk_length
+    # Drop buffered batches whose samples have all been fully consumed.
+    while left and np.all(left[0] == 0):
+      inputs.pop(0)
+      hints.pop(0)
+      outputs.pop(0)
+      left.pop(0)
+      lengths.pop(0)
+    yield samplers.Feedback(
+        samplers.FeaturesChunked(chunk_inputs, chunk_hints,
+                                 start_mark, end_mark),
+        chunk_outputs)
+def create_chunked_dataset(folder, algorithm, split, batch_size, chunk_length):
+  """Load a CLRS split and wrap it in a `chunkify` generator.
+  Args:
+    folder: directory passed to `tfds.load` as `data_dir`.
+    algorithm: algorithm name; also the key used to look up the spec.
+    split: split name, used both in the dataset name and as `split`.
+    batch_size: batch size applied after repeating the dataset.
+    chunk_length: time length of the chunks produced by `chunkify`.
+  Returns:
+    A pair `(chunk_generator, spec)` where `spec` is `specs.SPECS[algorithm]`.
+  """
+  dataset = tfds.load(f'clrs_dataset/{algorithm}_{split}',
+                      data_dir=folder, split=split)
+  dataset = dataset.repeat()
+  dataset = dataset.batch(batch_size)
+  dataset = dataset.map(lambda d: _preprocess(d, algorithm=algorithm))
+  # `chunkify` calls `next()` on its input, so hand it a numpy iterator.
+  dataset = dataset.as_numpy_iterator()
+  return chunkify(dataset, chunk_length), specs.SPECS[algorithm]

benchmarks/CLRS/env/dataset_test.py ADDED Viewed

	@@ -0,0 +1,116 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `dataset.py`."""
+from typing import Generator, List
+from absl.testing import absltest
+from absl.testing import parameterized
+from clrs._src import dataset
+from clrs._src import samplers
+from clrs._src import specs
+import numpy as np
+_Array = np.ndarray
+def _stack_to_shortest(x: List[_Array]) -> _Array:
+  """Truncate every array in `x` to the shortest length and stack them."""
+  min_len = min(map(len, x))
+  return np.array([a[:min_len] for a in x])
+def _make_sampler(algo: str) -> samplers.Sampler:
+  """Build a sampler for `algo` with the CLRS30 'val' seed, count and length.
+  Args:
+    algo: algorithm name forwarded to `samplers.build_sampler`.
+  Returns:
+    The first element of the pair returned by `samplers.build_sampler`; the
+    second element is discarded.
+  """
+  sampler, _ = samplers.build_sampler(
+      algo,
+      seed=samplers.CLRS30['val']['seed'],
+      num_samples=samplers.CLRS30['val']['num_samples'],
+      length=samplers.CLRS30['val']['length'],
+  )
+  return sampler
+def _make_iterable_sampler(
+    algo: str, batch_size: int) -> Generator[samplers.Feedback, None, None]:
+  """Yield `sampler.next(batch_size)` from a freshly built sampler forever.
+  Args:
+    algo: algorithm name used to build the sampler.
+    batch_size: number of samples requested from the sampler at every step.
+  """
+  sampler = _make_sampler(algo)
+  while True:
+    yield sampler.next(batch_size)
+class DatasetTest(parameterized.TestCase):
+  """Tests for `dataset.chunkify`."""
+  @parameterized.product(
+      name=specs.CLRS_30_ALGS[:5],
+      chunk_length=[20, 50])
+  def test_chunkify(self, name: str, chunk_length: int):
+    """Test that samples are concatenated and split in chunks correctly."""
+    batch_size = 8
+    # Two samplers built with identical arguments: `ds` provides reference
+    # full samples, the other one feeds `chunkify`.
+    ds = _make_iterable_sampler(name, batch_size)
+    chunked_ds = dataset.chunkify(
+        _make_iterable_sampler(name, batch_size),
+        chunk_length)
+    samples = [next(ds) for _ in range(20)]
+    # Cumulative lengths across the 20 reference batches, per batch element.
+    cum_lengths = np.cumsum([s.features.lengths for s in samples], axis=0)
+    # Enough chunks to cover the longest cumulative length.
+    n_chunks = np.amax(cum_lengths[-1]).astype(int) // chunk_length + 1
+    chunks = [next(chunked_ds) for _ in range(n_chunks)]
+    # Check correctness of `is_first` and `is_last` markers
+    start_idx = _stack_to_shortest([np.where(x)[0] for x in np.concatenate(
+        [c.features.is_first for c in chunks]).T]).T
+    end_idx = _stack_to_shortest([np.where(x)[0] for x in np.concatenate(
+        [c.features.is_last for c in chunks]).T]).T
+    assert len(start_idx) >= len(cum_lengths)
+    start_idx = start_idx[:len(cum_lengths)]
+    assert len(end_idx) >= len(cum_lengths)
+    end_idx = end_idx[:len(cum_lengths)]
+    # Samples start at step 0, end at `cum_length - 1`, and each next sample
+    # starts right where the previous one ended.
+    np.testing.assert_equal(start_idx[0], 0)
+    np.testing.assert_array_equal(cum_lengths - 1, end_idx)
+    np.testing.assert_array_equal(cum_lengths[:-1], start_idx[1:])
+    # Check that inputs, outputs and hints have been copied correctly
+    all_input = np.concatenate([c.features.inputs[0].data for c in chunks])
+    all_output = np.concatenate([c.outputs[0].data for c in chunks])
+    all_hint = np.concatenate([c.features.hints[0].data for c in chunks])
+    for i in range(batch_size):
+      length0 = int(samples[0].features.lengths[i])
+      length1 = int(samples[1].features.lengths[i])
+      # Check first sample
+      np.testing.assert_array_equal(
+          all_input[:length0, i],
+          np.tile(samples[0].features.inputs[0].data[i], [length0, 1]))
+      np.testing.assert_array_equal(
+          all_output[:length0, i],
+          np.tile(samples[0].outputs[0].data[i], [length0, 1]))
+      np.testing.assert_array_equal(
+          all_hint[:length0, i],
+          samples[0].features.hints[0].data[:length0, i])
+      # Check second sample
+      np.testing.assert_array_equal(
+          all_input[length0:length0 + length1, i],
+          np.tile(samples[1].features.inputs[0].data[i], [length1, 1]))
+      np.testing.assert_array_equal(
+          all_output[length0:length0 + length1, i],
+          np.tile(samples[1].outputs[0].data[i], [length1, 1]))
+      np.testing.assert_array_equal(
+          all_hint[length0:length0 + length1, i],
+          samples[1].features.hints[0].data[:length1, i])
+# Run the test cases through absltest when this file is executed directly.
+if __name__ == '__main__':
+  absltest.main()

benchmarks/CLRS/env/decoders.py ADDED Viewed

	@@ -0,0 +1,381 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""decoders utilities."""
+import functools
+from typing import Dict, Optional
+import chex
+from clrs._src import probing
+from clrs._src import specs
+import haiku as hk
+import jax
+import jax.numpy as jnp
+_Array = chex.Array
+_DataPoint = probing.DataPoint
+_Location = specs.Location
+_Spec = specs.Spec
+_Stage = specs.Stage
+_Type = specs.Type
+def log_sinkhorn(x: _Array, steps: int, temperature: float, zero_diagonal: bool,
+                 noise_rng_key: Optional[_Array]) -> _Array:
+  """Sinkhorn operator in log space, to postprocess permutation pointer logits.
+  Args:
+    x: input of shape [..., n, n], a batch of square matrices.
+    steps: number of iterations.
+    temperature: temperature parameter (as temperature approaches zero, the
+      output approaches a permutation matrix).
+    zero_diagonal: whether to force the diagonal logits towards -inf.
+    noise_rng_key: key to add Gumbel noise.
+  Returns:
+    Elementwise logarithm of a doubly-stochastic matrix (a matrix with
+    non-negative elements whose rows and columns sum to 1).
+  """
+  assert x.ndim >= 2
+  assert x.shape[-1] == x.shape[-2]
+  if noise_rng_key is not None:
+    # Add standard Gumbel noise (see https://arxiv.org/abs/1802.08665)
+    # The 1e-12 offsets keep both logarithms away from log(0).
+    noise = -jnp.log(-jnp.log(jax.random.uniform(noise_rng_key,
+                                                 x.shape) + 1e-12) + 1e-12)
+    x = x + noise
+  x /= temperature
+  if zero_diagonal:
+    # Subtract a large constant on the diagonal (after temperature scaling).
+    x = x - 1e6 * jnp.eye(x.shape[-1])
+  # Each step normalizes along the last axis, then along the one before it.
+  for _ in range(steps):
+    x = jax.nn.log_softmax(x, axis=-1)
+    x = jax.nn.log_softmax(x, axis=-2)
+  return x
+def construct_decoders(loc: str, t: str, hidden_dim: int, nb_dims: int,
+                       name: str):
+  """Constructs decoders."""
+  linear = functools.partial(hk.Linear, name=f"{name}_dec_linear")
+  if loc == _Location.NODE:
+    # Node decoders.
+    if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+      decoders = (linear(1),)
+    elif t == _Type.CATEGORICAL:
+      decoders = (linear(nb_dims),)
+    elif t in [_Type.POINTER, _Type.PERMUTATION_POINTER]:
+      decoders = (linear(hidden_dim), linear(hidden_dim), linear(hidden_dim),
+                  linear(1))
+    else:
+      raise ValueError(f"Invalid Type {t}")
+  elif loc == _Location.EDGE:
+    # Edge decoders.
+    if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+      decoders = (linear(1), linear(1), linear(1))
+    elif t == _Type.CATEGORICAL:
+      decoders = (linear(nb_dims), linear(nb_dims), linear(nb_dims))
+    elif t == _Type.POINTER:
+      decoders = (linear(hidden_dim), linear(hidden_dim),
+                  linear(hidden_dim), linear(hidden_dim), linear(1))
+    else:
+      raise ValueError(f"Invalid Type {t}")
+  elif loc == _Location.GRAPH:
+    # Graph decoders.
+    if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+      decoders = (linear(1), linear(1))
+    elif t == _Type.CATEGORICAL:
+      decoders = (linear(nb_dims), linear(nb_dims))
+    elif t == _Type.POINTER:
+      decoders = (linear(1), linear(1),
+                  linear(1))
+    else:
+      raise ValueError(f"Invalid Type {t}")
+  else:
+    raise ValueError(f"Invalid Location {loc}")
+  return decoders
+def construct_diff_decoders(name: str):
+  """Constructs diff decoders."""
+  linear = functools.partial(hk.Linear, name=f"{name}_diffdec_linear")
+  decoders = {}
+  decoders[_Location.NODE] = linear(1)
+  decoders[_Location.EDGE] = (linear(1), linear(1), linear(1))
+  decoders[_Location.GRAPH] = (linear(1), linear(1))
+  return decoders
+def postprocess(spec: _Spec, preds: Dict[str, _Array],
+                sinkhorn_temperature: float,
+                sinkhorn_steps: int,
+                hard: bool) -> Dict[str, _DataPoint]:
+  """Postprocesses decoder output.
+  This is done on outputs in order to score performance, and on hints in
+  order to score them but also in order to feed them back to the model.
+  At scoring time, the postprocessing mode is "hard", logits will be
+  arg-maxed and masks will be thresholded. However, for the case of the hints
+  that are fed back in the model, the postprocessing can be hard or soft,
+  depending on whether we want to let gradients flow through them or not.
+  Args:
+    spec: The spec of the algorithm whose outputs/hints we are postprocessing.
+    preds: Output and/or hint predictions, as produced by decoders.
+    sinkhorn_temperature: Parameter for the sinkhorn operator on permutation
+      pointers.
+    sinkhorn_steps: Parameter for the sinkhorn operator on permutation
+      pointers.
+    hard: whether to do hard postprocessing, which involves argmax for
+      MASK_ONE, CATEGORICAL and POINTERS, thresholding for MASK, and stop
+      gradient through for SCALAR. If False, soft postprocessing will be used,
+      with softmax, sigmoid and gradients allowed.
+  Returns:
+    The postprocessed `preds`. In "soft" post-processing, POINTER types will
+    change to SOFT_POINTER, so encoders know they do not need to be
+    pre-processed before feeding them back in.
+  """
+  result = {}
+  for name in preds.keys():
+    _, loc, t = spec[name]
+    new_t = t
+    data = preds[name]
+    if t == _Type.SCALAR:
+      if hard:
+        data = jax.lax.stop_gradient(data)
+    elif t == _Type.MASK:
+      if hard:
+        data = (data > 0.0) * 1.0
+      else:
+        data = jax.nn.sigmoid(data)
+    elif t in [_Type.MASK_ONE, _Type.CATEGORICAL]:
+      cat_size = data.shape[-1]
+      if hard:
+        best = jnp.argmax(data, -1)
+        data = hk.one_hot(best, cat_size)
+      else:
+        data = jax.nn.softmax(data, axis=-1)
+    elif t == _Type.POINTER:
+      if hard:
+        data = jnp.argmax(data, -1).astype(float)
+      else:
+        data = jax.nn.softmax(data, -1)
+        new_t = _Type.SOFT_POINTER
+    elif t == _Type.PERMUTATION_POINTER:
+      # Convert the matrix of logits to a doubly stochastic matrix.
+      data = log_sinkhorn(
+          x=data,
+          steps=sinkhorn_steps,
+          temperature=sinkhorn_temperature,
+          zero_diagonal=True,
+          noise_rng_key=None)
+      data = jnp.exp(data)
+      if hard:
+        data = jax.nn.one_hot(jnp.argmax(data, axis=-1), data.shape[-1])
+    else:
+      raise ValueError("Invalid type")
+    result[name] = probing.DataPoint(
+        name=name, location=loc, type_=new_t, data=data)
+  return result
+def decode_fts(
+    decoders,
+    spec: _Spec,
+    h_t: _Array,
+    adj_mat: _Array,
+    edge_fts: _Array,
+    graph_fts: _Array,
+    inf_bias: bool,
+    inf_bias_edge: bool,
+    repred: bool,
+):
+  """Decodes node, edge and graph features."""
+  output_preds = {}
+  hint_preds = {}
+  for name in decoders:
+    decoder = decoders[name]
+    stage, loc, t = spec[name]
+    if loc == _Location.NODE:
+      preds = _decode_node_fts(decoder, t, h_t, edge_fts, adj_mat,
+                               inf_bias, repred)
+    elif loc == _Location.EDGE:
+      preds = _decode_edge_fts(decoder, t, h_t, edge_fts, adj_mat,
+                               inf_bias_edge)
+    elif loc == _Location.GRAPH:
+      preds = _decode_graph_fts(decoder, t, h_t, graph_fts)
+    else:
+      raise ValueError("Invalid output type")
+    if stage == _Stage.OUTPUT:
+      output_preds[name] = preds
+    elif stage == _Stage.HINT:
+      hint_preds[name] = preds
+    else:
+      raise ValueError(f"Found unexpected decoder {name}")
+  return hint_preds, output_preds
+def _decode_node_fts(decoders, t: str, h_t: _Array, edge_fts: _Array,
+                     adj_mat: _Array, inf_bias: bool, repred: bool) -> _Array:
+  """Decodes node features."""
+  if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+    preds = jnp.squeeze(decoders[0](h_t), -1)
+  elif t == _Type.CATEGORICAL:
+    preds = decoders[0](h_t)
+  elif t in [_Type.POINTER, _Type.PERMUTATION_POINTER]:
+    p_1 = decoders[0](h_t)
+    p_2 = decoders[1](h_t)
+    p_3 = decoders[2](edge_fts)
+    p_e = jnp.expand_dims(p_2, -2) + p_3
+    p_m = jnp.maximum(jnp.expand_dims(p_1, -2),
+                      jnp.transpose(p_e, (0, 2, 1, 3)))
+    preds = jnp.squeeze(decoders[3](p_m), -1)
+    if inf_bias:
+      per_batch_min = jnp.min(preds, axis=range(1, preds.ndim), keepdims=True)
+      preds = jnp.where(adj_mat > 0.5,
+                        preds,
+                        jnp.minimum(-1.0, per_batch_min - 1.0))
+    if t == _Type.PERMUTATION_POINTER:
+      if repred:  # testing or validation, no Gumbel noise
+        preds = log_sinkhorn(
+            x=preds, steps=10, temperature=0.1,
+            zero_diagonal=True, noise_rng_key=None)
+      else:  # training, add Gumbel noise
+        preds = log_sinkhorn(
+            x=preds, steps=10, temperature=0.1,
+            zero_diagonal=True, noise_rng_key=hk.next_rng_key())
+  else:
+    raise ValueError("Invalid output type")
+  return preds
+def _decode_edge_fts(decoders, t: str, h_t: _Array, edge_fts: _Array,
+                     adj_mat: _Array, inf_bias_edge: bool) -> _Array:
+  """Decodes edge features."""
+  pred_1 = decoders[0](h_t)
+  pred_2 = decoders[1](h_t)
+  pred_e = decoders[2](edge_fts)
+  pred = (jnp.expand_dims(pred_1, -2) + jnp.expand_dims(pred_2, -3) + pred_e)
+  if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+    preds = jnp.squeeze(pred, -1)
+  elif t == _Type.CATEGORICAL:
+    preds = pred
+  elif t == _Type.POINTER:
+    pred_2 = decoders[3](h_t)
+    p_m = jnp.maximum(jnp.expand_dims(pred, -2),
+                      jnp.expand_dims(
+                          jnp.expand_dims(pred_2, -3), -3))
+    preds = jnp.squeeze(decoders[4](p_m), -1)
+  else:
+    raise ValueError("Invalid output type")
+  if inf_bias_edge and t in [_Type.MASK, _Type.MASK_ONE]:
+    per_batch_min = jnp.min(preds, axis=range(1, preds.ndim), keepdims=True)
+    preds = jnp.where(adj_mat > 0.5,
+                      preds,
+                      jnp.minimum(-1.0, per_batch_min - 1.0))
+  return preds
+def _decode_graph_fts(decoders, t: str, h_t: _Array,
+                      graph_fts: _Array) -> _Array:
+  """Decodes graph features."""
+  gr_emb = jnp.max(h_t, axis=-2)
+  pred_n = decoders[0](gr_emb)
+  pred_g = decoders[1](graph_fts)
+  pred = pred_n + pred_g
+  if t in [_Type.SCALAR, _Type.MASK, _Type.MASK_ONE]:
+    preds = jnp.squeeze(pred, -1)
+  elif t == _Type.CATEGORICAL:
+    preds = pred
+  elif t == _Type.POINTER:
+    pred_2 = decoders[2](h_t)
+    ptr_p = jnp.expand_dims(pred, 1) + jnp.transpose(pred_2, (0, 2, 1))
+    preds = jnp.squeeze(ptr_p, 1)
+  else:
+    raise ValueError("Invalid output type")
+  return preds
+def maybe_decode_diffs(
+    diff_decoders,
+    h_t: _Array,
+    edge_fts: _Array,
+    graph_fts: _Array,
+    decode_diffs: bool,
+) -> Optional[Dict[str, _Array]]:
+  """Optionally decodes node, edge and graph diffs."""
+  if decode_diffs:
+    preds = {}
+    node = _Location.NODE
+    edge = _Location.EDGE
+    graph = _Location.GRAPH
+    preds[node] = _decode_node_diffs(diff_decoders[node], h_t)
+    preds[edge] = _decode_edge_diffs(diff_decoders[edge], h_t, edge_fts)
+    preds[graph] = _decode_graph_diffs(diff_decoders[graph], h_t, graph_fts)
+  else:
+    preds = None
+  return preds
+def _decode_node_diffs(decoders, h_t: _Array) -> _Array:
+  """Decodes node diffs."""
+  return jnp.squeeze(decoders(h_t), -1)
+def _decode_edge_diffs(decoders, h_t: _Array, edge_fts: _Array) -> _Array:
+  """Decodes edge diffs."""
+  e_pred_1 = decoders[0](h_t)
+  e_pred_2 = decoders[1](h_t)
+  e_pred_e = decoders[2](edge_fts)
+  preds = jnp.squeeze(
+      jnp.expand_dims(e_pred_1, -1) + jnp.expand_dims(e_pred_2, -2) + e_pred_e,
+      -1,
+  )
+  return preds
+def _decode_graph_diffs(decoders, h_t: _Array, graph_fts: _Array) -> _Array:
+  """Decodes graph diffs."""
+  gr_emb = jnp.max(h_t, axis=-2)
+  g_pred_n = decoders[0](gr_emb)
+  g_pred_g = decoders[1](graph_fts)
+  preds = jnp.squeeze(g_pred_n + g_pred_g, -1)
+  return preds

benchmarks/CLRS/env/decoders_test.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `decoders.py`."""
+from absl.testing import absltest
+import chex
+from clrs._src import decoders
+import jax
+import jax.numpy as jnp
+class DecodersTest(absltest.TestCase):
+  def test_log_sinkhorn(self):
+    x = jax.random.normal(jax.random.PRNGKey(42), (10, 10))
+    y = jnp.exp(decoders.log_sinkhorn(x, steps=10, temperature=1.0,
+                                      zero_diagonal=False,
+                                      noise_rng_key=None))
+    chex.assert_trees_all_close(jnp.sum(y, axis=-1), 1., atol=1e-4)
+    chex.assert_trees_all_close(jnp.sum(y, axis=-2), 1., atol=1e-4)
+  def test_log_sinkhorn_zero_diagonal(self):
+    x = jax.random.normal(jax.random.PRNGKey(42), (10, 10))
+    y = jnp.exp(decoders.log_sinkhorn(x, steps=10, temperature=1.0,
+                                      zero_diagonal=True,
+                                      noise_rng_key=None))
+    chex.assert_trees_all_close(jnp.sum(y, axis=-1), 1., atol=1e-4)
+    chex.assert_trees_all_close(jnp.sum(y, axis=-2), 1., atol=1e-4)
+    chex.assert_trees_all_close(jnp.sum(y.diagonal()), 0., atol=1e-4)
+if __name__ == '__main__':  # Only run the tests when executed as a script.
+  absltest.main()

benchmarks/CLRS/env/encoders.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Encoder utilities."""
+import functools
+import chex
+from clrs._src import probing
+from clrs._src import specs
+import haiku as hk
+import jax.numpy as jnp
+_Array = chex.Array  # Local shorthands for chex/probing/specs names used below.
+_DataPoint = probing.DataPoint
+_Location = specs.Location
+_Spec = specs.Spec
+_Stage = specs.Stage
+_Type = specs.Type
+def construct_encoders(stage: str, loc: str, t: str,
+                       hidden_dim: int, init: str, name: str):
+  """Constructs encoders."""
+  if init == 'xavier_on_scalars' and stage == _Stage.HINT and t == _Type.SCALAR:
+    initialiser = hk.initializers.TruncatedNormal(
+        stddev=1.0 / jnp.sqrt(hidden_dim))
+  elif init in ['default', 'xavier_on_scalars']:
+    initialiser = None
+  else:
+    raise ValueError(f'Encoder initialiser {init} not supported.')
+  linear = functools.partial(
+      hk.Linear,
+      w_init=initialiser,
+      name=f'{name}_enc_linear')
+  encoders = [linear(hidden_dim)]
+  if loc == _Location.EDGE and t == _Type.POINTER:
+    # Edge pointers need two-way encoders.
+    encoders.append(linear(hidden_dim))
+  return encoders
+def preprocess(dp: _DataPoint, nb_nodes: int) -> _DataPoint:
+  """Pre-process data point.
+  Make sure that the data is ready to be encoded into features.
+  If the data is of POINTER type, we expand the compressed index representation
+  to a full one-hot. But if the data is a SOFT_POINTER, the representation
+  is already expanded and we just overwrite the type as POINTER so that
+  it is treated as such for encoding.
+  Args:
+    dp: A DataPoint to prepare for encoding.
+    nb_nodes: Number of nodes in the graph, necessary to expand pointers to
+      the right dimension.
+  Returns:
+    The datapoint, with data and possibly type modified.
+  """
+  new_type = dp.type_
+  if dp.type_ == _Type.POINTER:
+    data = hk.one_hot(dp.data, nb_nodes)
+  else:
+    data = dp.data.astype(jnp.float32)
+    if dp.type_ == _Type.SOFT_POINTER:
+      new_type = _Type.POINTER
+  dp = probing.DataPoint(
+      name=dp.name, location=dp.location, type_=new_type, data=data)
+  return dp
+def accum_adj_mat(dp: _DataPoint, adj_mat: _Array) -> _Array:
+  """Accumulates adjacency matrix."""
+  if dp.location == _Location.NODE and dp.type_ in [_Type.POINTER,
+                                                    _Type.PERMUTATION_POINTER]:
+    adj_mat += ((dp.data + jnp.transpose(dp.data, (0, 2, 1))) > 0.5)
+  elif dp.location == _Location.EDGE and dp.type_ == _Type.MASK:
+    adj_mat += ((dp.data + jnp.transpose(dp.data, (0, 2, 1))) > 0.0)
+  return (adj_mat > 0.).astype('float32')  # pytype: disable=attribute-error  # numpy-scalars
+def accum_edge_fts(encoders, dp: _DataPoint, edge_fts: _Array) -> _Array:
+  """Encodes and accumulates edge features."""
+  if dp.location == _Location.NODE and dp.type_ in [_Type.POINTER,
+                                                    _Type.PERMUTATION_POINTER]:
+    encoding = _encode_inputs(encoders, dp)
+    edge_fts += encoding
+  elif dp.location == _Location.EDGE:
+    encoding = _encode_inputs(encoders, dp)
+    if dp.type_ == _Type.POINTER:
+      # Aggregate pointer contributions across sender and receiver nodes.
+      encoding_2 = encoders[1](jnp.expand_dims(dp.data, -1))
+      edge_fts += jnp.mean(encoding, axis=1) + jnp.mean(encoding_2, axis=2)
+    else:
+      edge_fts += encoding
+  return edge_fts
+def accum_node_fts(encoders, dp: _DataPoint, node_fts: _Array) -> _Array:
+  """Encodes and accumulates node features."""
+  is_pointer = (dp.type_ in [_Type.POINTER, _Type.PERMUTATION_POINTER])
+  if ((dp.location == _Location.NODE and not is_pointer) or
+      (dp.location == _Location.GRAPH and dp.type_ == _Type.POINTER)):
+    encoding = _encode_inputs(encoders, dp)
+    node_fts += encoding
+  return node_fts
+def accum_graph_fts(encoders, dp: _DataPoint,
+                    graph_fts: _Array) -> _Array:
+  """Encodes and accumulates graph features."""
+  if dp.location == _Location.GRAPH and dp.type_ != _Type.POINTER:
+    encoding = _encode_inputs(encoders, dp)
+    graph_fts += encoding
+  return graph_fts
+def _encode_inputs(encoders, dp: _DataPoint) -> _Array:
+  if dp.type_ == _Type.CATEGORICAL:
+    encoding = encoders[0](dp.data)
+  else:
+    encoding = encoders[0](jnp.expand_dims(dp.data, -1))
+  return encoding

benchmarks/CLRS/env/evaluation.py ADDED Viewed

	@@ -0,0 +1,202 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model base classes and utilities."""
+from typing import Dict, List, Tuple
+import chex
+from clrs._src import probing
+from clrs._src import specs
+import numpy as np
+_Array = chex.Array  # Array type used in the annotations below.
+Result = Dict[str, probing.DataPoint]  # Predictions keyed by feature name.
+def fuse_perm_and_mask(perm: probing.DataPoint,
+                       mask: probing.DataPoint) -> probing.DataPoint:
+  """Replace permutation pointers active in the mask with self-pointers.
+  Args:
+    perm: a node permutation_pointer; data shape is expected to be
+      [..., N, N], and ideally one-hot over the last two dimensions, although
+      this method does not check for one-hotness.
+    mask: a mask_one over nodes; data shape is expected to be
+      [..., N], and ideally one-hot over the last dimension, although
+      this method does not check for one-hotness.
+  Returns:
+    A node pointer with shape [..., N].
+  """
+  assert perm.type_ == specs.Type.PERMUTATION_POINTER
+  assert perm.location == specs.Location.NODE
+  assert mask.name == perm.name + '_mask'
+  assert mask.type_ == specs.Type.MASK_ONE
+  assert mask.location == specs.Location.NODE
+  assert perm.data.shape[-1] == perm.data.shape[-2]
+  assert perm.data.shape[:-1] == mask.data.shape
+  data = np.where(mask.data > 0.5,
+                  np.arange(perm.data.shape[-1]),  # self-pointers
+                  np.argmax(perm.data, axis=-1))   # original pointers
+  return probing.DataPoint(name=perm.name,
+                           type_=specs.Type.POINTER,
+                           location=perm.location,
+                           data=data)
+def _reduce_permutations_tuple(
+    targets: Tuple[probing.DataPoint, ...]) -> Tuple[probing.DataPoint, ...]:
+  """Reduce node pointer + mask_one permutation to just node pointer."""
+  out_targets = []
+  n_perms = 0
+  i = 0
+  while i < len(targets):
+    truth = targets[i]
+    if truth.type_ != specs.Type.PERMUTATION_POINTER:
+      out_targets.append(truth)
+      i += 1
+      continue
+    truth_mask = targets[i + 1]
+    out_targets.append(fuse_perm_and_mask(truth, truth_mask))
+    i += 2
+    n_perms += 1
+  assert len(out_targets) == len(targets) - n_perms
+  return tuple(out_targets)
+def _reduce_permutations_dict(predictions: Result) -> Result:
+  """Reduce node pointer + mask_one permutation to just node pointer."""
+  out_preds = {}
+  n_perms = 0
+  for k, pred in predictions.items():
+    if (k.endswith('_mask') and k[:-5] in predictions and
+        predictions[k[:-5]].type_ == specs.Type.PERMUTATION_POINTER):
+      # This mask will be processed with its associated permutation datapoint
+      continue
+    if pred.type_ != specs.Type.PERMUTATION_POINTER:
+      out_preds[k] = pred
+      continue
+    pred_mask = predictions[k + '_mask']
+    out_preds[k] = fuse_perm_and_mask(pred, pred_mask)
+    n_perms += 1
+  assert len(out_preds) == len(predictions) - n_perms
+  return out_preds
+def evaluate_hints(
+    hints: Tuple[probing.DataPoint, ...],
+    lengths: _Array,
+    hint_preds: List[Result],
+) -> Dict[str, _Array]:
+  """Evaluate hint predictions."""
+  evals = {}
+  hints = _reduce_permutations_tuple(hints)
+  hint_preds = [_reduce_permutations_dict(h) for h in hint_preds]
+  for truth in hints:
+    assert truth.name in hint_preds[0]
+    eval_along_time = [_evaluate(truth, p[truth.name],
+                                 idx=i+1, lengths=lengths)
+                       for (i, p) in enumerate(hint_preds)]
+    evals[truth.name] = np.sum(
+        [x * np.sum(i+1 < lengths)
+         for i, x in enumerate(eval_along_time)]) / np.sum(lengths - 1)
+    evals[truth.name + '_along_time'] = np.array(eval_along_time)
+  # Unlike outputs, the hints sometimes include scalars, which don't have
+  # a meaningful eval score. So we don't compute a global 'hint score' as we
+  # do for outputs.
+  return evals
+def evaluate(
+    outputs: Tuple[probing.DataPoint, ...],
+    predictions: Result,
+) -> Dict[str, float]:
+  """Evaluate output predictions."""
+  evals = {}
+  outputs = _reduce_permutations_tuple(outputs)
+  predictions = _reduce_permutations_dict(predictions)
+  for truth in outputs:
+    assert truth.name in predictions
+    pred = predictions[truth.name]
+    evals[truth.name] = _evaluate(truth, pred)
+  # Return a single scalar score that is the mean of all output scores.
+  evals['score'] = sum([v.item() for v in evals.values()]) / len(evals)
+  return evals
+def _evaluate(truth, pred, idx=None, lengths=None):
+  """Evaluate single prediction of hint or output."""
+  assert pred.name == truth.name
+  assert pred.location == truth.location
+  assert pred.type_ == truth.type_
+  if truth.type_ not in _EVAL_FN:
+    raise ValueError('Invalid type')
+  truth_data = truth.data
+  pred_data = pred.data
+  if idx is not None:
+    if np.all(idx >= lengths):
+      return 0.
+    truth_data = truth_data[idx][idx < lengths]
+    pred_data = pred_data[idx < lengths]
+  return _EVAL_FN[truth.type_](pred_data, truth_data)
+def _eval_one(pred, truth):
+  mask = np.all(truth != specs.OutputClass.MASKED, axis=-1)
+  return np.sum(
+      (np.argmax(pred, -1) == np.argmax(truth, -1)) * mask) / np.sum(mask)
+def _mask_fn(pred, truth):
+  """Evaluate outputs of type MASK, and account for any class imbalance."""
+  mask = (truth != specs.OutputClass.MASKED).astype(np.float32)
+  # Use F1 score for the masked outputs to address any imbalance
+  tp = np.sum((((pred > 0.5) * (truth > 0.5)) * 1.0) * mask)
+  fp = np.sum((((pred > 0.5) * (truth < 0.5)) * 1.0) * mask)
+  fn = np.sum((((pred < 0.5) * (truth > 0.5)) * 1.0) * mask)
+  # Protect against division by zero
+  if tp + fp > 0:
+    precision = tp / (tp + fp)
+  else:
+    precision = np.float32(1.0)
+  if tp + fn > 0:
+    recall = tp / (tp + fn)
+  else:
+    recall = np.float32(1.0)
+  if precision + recall > 0.0:
+    f_1 = 2.0 * precision * recall / (precision + recall)
+  else:
+    f_1 = np.float32(0.0)
+  return f_1
+_EVAL_FN = {  # Scoring function per type, called as fn(pred, truth).
+    specs.Type.SCALAR:
+        lambda pred, truth: np.mean((pred - truth)**2),  # Mean squared error.
+    specs.Type.MASK: _mask_fn,  # F1 score.
+    specs.Type.MASK_ONE:
+        _eval_one,  # Argmax accuracy.
+    specs.Type.CATEGORICAL:
+        _eval_one,  # Argmax accuracy.
+    specs.Type.POINTER:
+        lambda pred, truth: np.mean((pred == truth) * 1.0),  # Exact-match rate.
+}

benchmarks/CLRS/env/evaluation_test.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `evaluation.py`."""
+from absl.testing import absltest
+from clrs._src import evaluation
+from clrs._src import probing
+from clrs._src import specs
+import jax
+import jax.numpy as jnp
+import numpy as np
+class EvaluationTest(absltest.TestCase):
+  def test_reduce_permutations(self):
+    b = 8
+    n = 16
+    pred = jnp.stack([jax.random.permutation(jax.random.PRNGKey(i), n)
+                      for i in range(b)])
+    heads = jax.random.randint(jax.random.PRNGKey(42), (b,), 0, n)
+    perm = probing.DataPoint(name='test',
+                             type_=specs.Type.PERMUTATION_POINTER,
+                             location=specs.Location.NODE,
+                             data=jax.nn.one_hot(pred, n))
+    mask = probing.DataPoint(name='test_mask',
+                             type_=specs.Type.MASK_ONE,
+                             location=specs.Location.NODE,
+                             data=jax.nn.one_hot(heads, n))
+    output = evaluation.fuse_perm_and_mask(perm=perm, mask=mask)
+    expected_output = np.array(pred)
+    expected_output[np.arange(b), heads] = heads
+    self.assertEqual(output.name, 'test')
+    self.assertEqual(output.type_, specs.Type.POINTER)
+    self.assertEqual(output.location, specs.Location.NODE)
+    np.testing.assert_allclose(output.data, expected_output)
+if __name__ == '__main__':  # Only run the tests when executed as a script.
+  absltest.main()

benchmarks/CLRS/env/losses.py ADDED Viewed

	@@ -0,0 +1,209 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for calculating losses."""
+from typing import Dict, List, Tuple
+import chex
+from clrs._src import probing
+from clrs._src import specs
+import haiku as hk
+import jax
+import jax.numpy as jnp
+_Array = chex.Array  # Local shorthands for chex/probing/specs names used below.
+_DataPoint = probing.DataPoint
+_Location = specs.Location
+_OutputClass = specs.OutputClass
+_PredTrajectory = Dict[str, _Array]
+_PredTrajectories = List[_PredTrajectory]
+_Type = specs.Type
+EPS = 1e-12  # Lower bound for mask sums used as denominators below.
+def _expand_to(x: _Array, y: _Array) -> _Array:
+  while len(y.shape) > len(x.shape):
+    x = jnp.expand_dims(x, -1)
+  return x
+def _expand_and_broadcast_to(x: _Array, y: _Array) -> _Array:
+  return jnp.broadcast_to(_expand_to(x, y), y.shape)
+def output_loss_chunked(truth: _DataPoint, pred: _Array,
+                        is_last: _Array, nb_nodes: int) -> float:
+  """Output loss for time-chunked training."""
+  mask = None
+  if truth.type_ == _Type.SCALAR:
+    loss = (pred - truth.data)**2
+  elif truth.type_ == _Type.MASK:
+    loss = (
+        jnp.maximum(pred, 0) - pred * truth.data +
+        jnp.log1p(jnp.exp(-jnp.abs(pred))))
+    mask = (truth.data != _OutputClass.MASKED)
+  elif truth.type_ in [_Type.MASK_ONE, _Type.CATEGORICAL]:
+    mask = jnp.any(truth.data == _OutputClass.POSITIVE, axis=-1)
+    masked_truth = truth.data * (truth.data != _OutputClass.MASKED).astype(
+        jnp.float32)
+    loss = -jnp.sum(masked_truth * jax.nn.log_softmax(pred), axis=-1)
+  elif truth.type_ == _Type.POINTER:
+    loss = -jnp.sum(
+        hk.one_hot(truth.data, nb_nodes) * jax.nn.log_softmax(pred), axis=-1)
+  elif truth.type_ == _Type.PERMUTATION_POINTER:
+    # Predictions are NxN logits aiming to represent a doubly stochastic matrix.
+    # Compute the cross entropy between doubly stochastic pred and truth_data
+    loss = -jnp.sum(truth.data * pred, axis=-1)
+  if mask is not None:
+    mask = mask * _expand_and_broadcast_to(is_last, loss)
+  else:
+    mask = _expand_and_broadcast_to(is_last, loss)
+  total_mask = jnp.maximum(jnp.sum(mask), EPS)
+  return jnp.sum(jnp.where(mask, loss, 0.0)) / total_mask
+def output_loss(truth: _DataPoint, pred: _Array, nb_nodes: int) -> float:
+  """Output loss for full-sample training."""
+  if truth.type_ == _Type.SCALAR:
+    total_loss = jnp.mean((pred - truth.data)**2)
+  elif truth.type_ == _Type.MASK:
+    loss = (
+        jnp.maximum(pred, 0) - pred * truth.data +
+        jnp.log1p(jnp.exp(-jnp.abs(pred))))
+    mask = (truth.data != _OutputClass.MASKED).astype(jnp.float32)
+    total_loss = jnp.sum(loss * mask) / jnp.sum(mask)
+  elif truth.type_ in [_Type.MASK_ONE, _Type.CATEGORICAL]:
+    masked_truth = truth.data * (truth.data != _OutputClass.MASKED).astype(
+        jnp.float32)
+    total_loss = (-jnp.sum(masked_truth * jax.nn.log_softmax(pred)) /
+                  jnp.sum(truth.data == _OutputClass.POSITIVE))
+  elif truth.type_ == _Type.POINTER:
+    total_loss = (
+        jnp.mean(-jnp.sum(
+            hk.one_hot(truth.data, nb_nodes) * jax.nn.log_softmax(pred),
+            axis=-1)))
+  elif truth.type_ == _Type.PERMUTATION_POINTER:
+    # Predictions are NxN logits aiming to represent a doubly stochastic matrix.
+    # Compute the cross entropy between doubly stochastic pred and truth_data
+    total_loss = jnp.mean(-jnp.sum(truth.data * pred, axis=-1))
+  return total_loss
+def hint_loss_chunked(
+    truth: _DataPoint,
+    pred: _Array,
+    is_first: _Array,
+    nb_nodes: int,
+):
+  """Hint loss for time-chunked training."""
+  loss, mask = _hint_loss(
+      truth_data=truth.data,
+      truth_type=truth.type_,
+      pred=pred,
+      nb_nodes=nb_nodes,
+  )
+  mask *= (1 - _expand_to(is_first, loss)).astype(jnp.float32)
+  loss = jnp.sum(loss * mask) / jnp.maximum(jnp.sum(mask), EPS)
+  return loss
+def hint_loss(
+    truth: _DataPoint,
+    preds: List[_Array],
+    lengths: _Array,
+    nb_nodes: int,
+    verbose: bool = False,
+):
+  """Hint loss for full-sample training."""
+  total_loss = 0.
+  verbose_loss = {}
+  length = truth.data.shape[0] - 1
+  loss, mask = _hint_loss(
+      truth_data=truth.data[1:],
+      truth_type=truth.type_,
+      pred=jnp.stack(preds),
+      nb_nodes=nb_nodes,
+  )
+  mask *= _is_not_done_broadcast(lengths, jnp.arange(length)[:, None], loss)
+  loss = jnp.sum(loss * mask) / jnp.maximum(jnp.sum(mask), EPS)
+  if verbose:
+    verbose_loss['loss_' + truth.name] = loss
+  else:
+    total_loss += loss
+  return verbose_loss if verbose else total_loss
+def _hint_loss(
+    truth_data: _Array,
+    truth_type: str,
+    pred: _Array,
+    nb_nodes: int,
+) -> Tuple[_Array, _Array]:
+  """Hint loss helper."""
+  mask = None
+  if truth_type == _Type.SCALAR:
+    loss = (pred - truth_data)**2
+  elif truth_type == _Type.MASK:
+    loss = (jnp.maximum(pred, 0) - pred * truth_data +
+            jnp.log1p(jnp.exp(-jnp.abs(pred))))
+    mask = (truth_data != _OutputClass.MASKED).astype(jnp.float32)  # pytype: disable=attribute-error  # numpy-scalars
+  elif truth_type == _Type.MASK_ONE:
+    loss = -jnp.sum(truth_data * jax.nn.log_softmax(pred), axis=-1,
+                    keepdims=True)
+  elif truth_type == _Type.CATEGORICAL:
+    loss = -jnp.sum(truth_data * jax.nn.log_softmax(pred), axis=-1)
+    mask = jnp.any(truth_data == _OutputClass.POSITIVE, axis=-1).astype(
+        jnp.float32)
+  elif truth_type == _Type.POINTER:
+    loss = -jnp.sum(
+        hk.one_hot(truth_data, nb_nodes) * jax.nn.log_softmax(pred),
+        axis=-1)
+  elif truth_type == _Type.PERMUTATION_POINTER:
+    # Predictions are NxN logits aiming to represent a doubly stochastic matrix.
+    # Compute the cross entropy between doubly stochastic pred and truth_data
+    loss = -jnp.sum(truth_data * pred, axis=-1)
+  if mask is None:
+    mask = jnp.ones_like(loss)
+  return loss, mask
+def _is_not_done_broadcast(lengths, i, tensor):
+  is_not_done = (lengths > i + 1) * 1.0
+  while len(is_not_done.shape) < len(tensor.shape):  # pytype: disable=attribute-error  # numpy-scalars
+    is_not_done = jnp.expand_dims(is_not_done, -1)
+  return is_not_done

benchmarks/CLRS/env/losses_test.py ADDED Viewed

	@@ -0,0 +1,166 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `losses.py`."""
+from typing import Generator
+from absl.testing import absltest
+from absl.testing import parameterized
+from clrs._src import dataset
+from clrs._src import losses
+from clrs._src import probing
+from clrs._src import samplers
+from clrs._src import specs
+import jax
+import jax.numpy as jnp
+import numpy as np
+_Array = np.ndarray  # Short alias; appears unused by the helpers in this module.
+_Location = specs.Location  # Short alias; appears unused by the helpers in this module.
+def _make_sampler(algo: str, nb_nodes: int) -> samplers.Sampler:
+  sampler, _ = samplers.build_sampler(
+      algo,
+      seed=samplers.CLRS30['val']['seed'],
+      num_samples=samplers.CLRS30['val']['num_samples'],
+      length=nb_nodes,
+  )
+  return sampler
+def _make_iterable_sampler(
+    algo: str, batch_size: int,
+    nb_nodes: int) -> Generator[samplers.Feedback, None, None]:
+  sampler = _make_sampler(algo, nb_nodes)
+  while True:
+    yield sampler.next(batch_size)
+def _as_pred_data(x, nb_nodes, seed, batch_axis):
+  """Fake a prediction from a data point."""
+  # Permute along batch axis to make the prediction different.
+  key = jax.random.PRNGKey(seed)
+  data = jax.random.permutation(key, x.data, axis=batch_axis)
+  # Extend to one-hot for pointer types.
+  if x.type_ == specs.Type.POINTER:
+    return jax.nn.one_hot(data, nb_nodes)
+  return data
+def _mask_datapoint(x, seed, t_axis=None):
+  """Add some masking to data."""
+  key = jax.random.PRNGKey(seed)
+  data = x.data
+  if x.type_ == specs.Type.MASK:
+    # mask some data at random
+    mask_shape = list(data.shape)
+    if t_axis is not None:
+      mask_shape[t_axis] = 1
+    mask = jax.random.uniform(key, tuple(mask_shape)) < 0.2
+    data = jnp.where(mask, specs.OutputClass.MASKED, data)
+  elif x.type_ in [specs.Type.CATEGORICAL, specs.Type.MASK_ONE]:
+    # mask some data at random (all categories together)
+    mask_shape = list(data.shape)[:-1]
+    if t_axis is not None:
+      mask_shape[t_axis] = 1
+    mask = jax.random.uniform(key, tuple(mask_shape)) < 0.2
+    data = jnp.where(mask[..., None], specs.OutputClass.MASKED, data)
+  return probing.DataPoint(name=x.name, location=x.location, type_=x.type_,
+                           data=data)
+def _rand_diff(seed, shape):
+  return 2.0 * jax.random.uniform(jax.random.PRNGKey(seed), shape) - 1.0
+def _rand_mask(seed, shape, p=0.5):
+  return (jax.random.uniform(jax.random.PRNGKey(seed), shape) > p).astype(float)
+def invert(d):
+  """Dict of lists -> list of dicts."""
+  if d:
+    return [dict(zip(d, i)) for i in zip(*d.values())]
+def _create_data(algo, nb_nodes):
+  batch_size = 8
+  ds = _make_iterable_sampler(algo, batch_size, nb_nodes)
+  full_sample = next(ds)
+  chunk_length = full_sample.features.lengths[0].astype(int)
+  chunked_ds = dataset.chunkify(
+      _make_iterable_sampler(algo, batch_size, nb_nodes),
+      chunk_length)
+  chunk_sample = next(chunked_ds)
+  return full_sample, chunk_sample
+class FullVsChunkLossesTest(parameterized.TestCase):
+  """Test that the full and chunked versions of the losses match."""
+  # Test two algorithms with fixed-length, covering all data types
+  @parameterized.parameters('dfs', 'floyd_warshall')
+  def test_output_loss(self, algo):
+    nb_nodes = 16
+    full_sample, chunk_sample = _create_data(algo, nb_nodes)
+    # Calculate output loss.
+    for truth_full, truth_chunked in zip(full_sample.outputs,
+                                         chunk_sample.outputs):
+      chunk_output_loss = losses.output_loss_chunked(
+          truth=_mask_datapoint(truth_chunked, seed=0),
+          pred=_as_pred_data(truth_chunked, nb_nodes, 0, 1),
+          is_last=chunk_sample.features.is_last,
+          nb_nodes=nb_nodes,
+      )
+      full_output_loss = losses.output_loss(
+          truth=_mask_datapoint(truth_full, seed=0),
+          pred=_as_pred_data(truth_full, nb_nodes, 0, 0),
+          nb_nodes=nb_nodes,
+      )
+      np.testing.assert_allclose(chunk_output_loss, full_output_loss, rtol=1e-4)
+  @parameterized.parameters('dfs', 'floyd_warshall')
+  def test_hint_loss(self, algo):
+    nb_nodes = 16
+    full_sample, chunk_sample = _create_data(algo, nb_nodes)
+    for truth_full, truth_chunked in zip(full_sample.features.hints,
+                                         chunk_sample.features.hints):
+      np.testing.assert_array_equal(truth_full.data, truth_chunked.data)
+      pred = _as_pred_data(truth_chunked, nb_nodes, 0, 1)
+      chunk_hint_loss = losses.hint_loss_chunked(
+          truth=_mask_datapoint(truth_chunked, seed=1, t_axis=0),
+          pred=pred,
+          is_first=chunk_sample.features.is_first,
+          nb_nodes=nb_nodes,
+      )
+      full_preds = pred[1:]
+      full_hint_loss = losses.hint_loss(
+          truth=_mask_datapoint(truth_full, 1, t_axis=0),
+          preds=full_preds,
+          lengths=full_sample.features.lengths,
+          nb_nodes=nb_nodes,
+      )
+      np.testing.assert_allclose(chunk_hint_loss, full_hint_loss, rtol=1e-4)
+if __name__ == '__main__':  # Only run the tests when executed as a script.
+  absltest.main()

benchmarks/CLRS/env/model.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model base classes and utilities."""
+import abc
+from typing import Dict, List, Optional, Union
+from clrs._src import probing
+from clrs._src import samplers
+from clrs._src import specs
+Result = Dict[str, probing.DataPoint]  # Return type of `Model.predict`: string keys mapped to data points.
+class Model(abc.ABC):
+  """Abstract base class for CLRS3-B models."""
+  def __init__(self, spec: Union[specs.Spec, List[specs.Spec]]):
+    """Set up the problem, prepare to predict on first task."""
+    if not isinstance(spec, list):
+      spec = [spec]
+    self._spec = spec
+  @abc.abstractmethod
+  def predict(self, features: samplers.Features) -> Result:
+    """Make predictions about the current task."""
+    pass
+  @abc.abstractmethod
+  def feedback(self, feedback: Optional[samplers.Feedback]):
+    """Advance to the next task, incorporating any available feedback."""
+    pass

benchmarks/CLRS/env/nets.py ADDED Viewed

	@@ -0,0 +1,719 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""JAX implementation of CLRS basic network."""
+import functools
+from typing import Dict, List, Optional, Tuple
+import chex
+from clrs._src import decoders
+from clrs._src import encoders
+from clrs._src import probing
+from clrs._src import processors
+from clrs._src import samplers
+from clrs._src import specs
+import haiku as hk
+import jax
+import jax.numpy as jnp
+# Short aliases for the types used in annotations and type checks below.
+_Array = chex.Array
+_DataPoint = probing.DataPoint
+_Features = samplers.Features
+_FeaturesChunked = samplers.FeaturesChunked
+_Location = specs.Location
+_Spec = specs.Spec
+_Stage = specs.Stage
+_Trajectory = samplers.Trajectory
+_Type = specs.Type
+@chex.dataclass
+class _MessagePassingScanState:
+  """State of `Net._msg_passing_step`: used as scan carry and as stacked output."""
+  hint_preds: chex.Array  # Hint predictions of the latest step; None before the first step.
+  output_preds: chex.Array  # Output predictions so far; None before the first step.
+  hiddens: chex.Array  # Hidden state, initialised to zeros of (batch, nodes, hidden_dim).
+  lstm_state: Optional[hk.LSTMState]  # LSTM state, or None when no LSTM is used.
+@chex.dataclass
+class _MessagePassingOutputChunked:
+  """Per-step output of `NetChunked._msg_passing_step`."""
+  hint_preds: chex.Array  # Hint predictions decoded at this step.
+  output_preds: chex.Array  # Output predictions decoded at this step.
+@chex.dataclass
+class MessagePassingStateChunked:
+  """State carried by `NetChunked._msg_passing_step` from one step to the next."""
+  inputs: chex.Array  # Inputs to be used for prediction in the next step.
+  hints: chex.Array  # Ground-truth hints to be used in the next step.
+  is_first: chex.Array  # Beginning-of-sample markers for the next step.
+  hint_preds: chex.Array  # Hint predictions produced by the step just run.
+  hiddens: chex.Array  # Hidden state produced by the step just run.
+  lstm_state: Optional[hk.LSTMState]  # LSTM state, or None when no LSTM is used.
+class Net(hk.Module):
+  """Building blocks (networks) used to encode and decode messages.
+  Encoders and decoders are built separately for every algorithm spec, while
+  one processor (and, optionally, one LSTM) is built per call and applied to
+  each algorithm processed in that call.
+  """
+  def __init__(
+      self,
+      spec: List[_Spec],
+      hidden_dim: int,
+      encode_hints: bool,
+      decode_hints: bool,
+      processor_factory: processors.ProcessorFactory,
+      use_lstm: bool,
+      encoder_init: str,
+      dropout_prob: float,
+      hint_teacher_forcing: float,
+      hint_repred_mode='soft',
+      nb_dims=None,
+      nb_msg_passing_steps=1,
+      name: str = 'net',
+  ):
+    """Constructs a `Net`.
+    Args:
+      spec: List of algorithm specs, one per algorithm handled by this net.
+      hidden_dim: Size of the hidden features; also passed to the encoders,
+        decoders, processor factory and LSTM.
+      encode_hints: Whether hints are encoded alongside the inputs.
+      decode_hints: Whether hint decoders are built and hint predictions are
+        fed back into later steps.
+      processor_factory: Called with `hidden_dim` to build the processor.
+      use_lstm: Whether an LSTM is applied after the processor.
+      encoder_init: Passed as `init` when constructing the encoders.
+      dropout_prob: Dropout rate applied to the hidden state when `repred` is
+        False.
+      hint_teacher_forcing: Probability with which each example in the batch
+        is fed the ground-truth hint instead of the decoded one; values below
+        1.0 enable noisy teacher forcing.
+      hint_repred_mode: One of 'soft', 'hard' or 'hard_on_eval'; selects
+        whether decoded hints are postprocessed in hard mode.
+      nb_dims: Indexed as `nb_dims[algo_idx][name]` when building decoders, so
+        it must be provided whenever decoders are built despite the `None`
+        default.
+      nb_msg_passing_steps: Number of processor applications per step.
+      name: Name of the Haiku module.
+    """
+    super().__init__(name=name)
+    self._dropout_prob = dropout_prob
+    self._hint_teacher_forcing = hint_teacher_forcing
+    self._hint_repred_mode = hint_repred_mode
+    self.spec = spec
+    self.hidden_dim = hidden_dim
+    self.encode_hints = encode_hints
+    self.decode_hints = decode_hints
+    self.processor_factory = processor_factory
+    self.nb_dims = nb_dims
+    self.use_lstm = use_lstm
+    self.encoder_init = encoder_init
+    self.nb_msg_passing_steps = nb_msg_passing_steps
+  def _msg_passing_step(self,
+                        mp_state: _MessagePassingScanState,
+                        i: int,
+                        hints: List[_DataPoint],
+                        repred: bool,
+                        lengths: chex.Array,
+                        batch_size: int,
+                        nb_nodes: int,
+                        inputs: _Trajectory,
+                        first_step: bool,
+                        spec: _Spec,
+                        encs: Dict[str, List[hk.Module]],
+                        decs: Dict[str, Tuple[hk.Module]],
+                        return_hints: bool,
+                        return_all_outputs: bool
+                        ):
+    """Runs one step of message passing on full-sample data.
+    Selects the hints fed to the network for step `i` (ground truth, decoded
+    predictions, or a per-example mix under noisy teacher forcing), runs
+    `_one_step_pred`, and freezes the output predictions of samples whose
+    `lengths` mark them as already finished.
+    Returns:
+      A 2-tuple `(new_mp_state, accum_mp_state)`: the state carried to the
+      next step, and the lean per-step record that gets stacked over steps.
+    """
+    if self.decode_hints and not first_step:
+      assert self._hint_repred_mode in ['soft', 'hard', 'hard_on_eval']
+      hard_postprocess = (self._hint_repred_mode == 'hard' or
+                          (self._hint_repred_mode == 'hard_on_eval' and repred))
+      decoded_hint = decoders.postprocess(spec,
+                                          mp_state.hint_preds,
+                                          sinkhorn_temperature=0.1,
+                                          sinkhorn_steps=25,
+                                          hard=hard_postprocess)
+    if repred and self.decode_hints and not first_step:
+      # Evaluation mode: feed back only our own decoded hint predictions.
+      cur_hint = []
+      for hint in decoded_hint:
+        cur_hint.append(decoded_hint[hint])
+    else:
+      cur_hint = []
+      needs_noise = (self.decode_hints and not first_step and
+                     self._hint_teacher_forcing < 1.0)
+      if needs_noise:
+        # For noisy teacher forcing, choose which examples in the batch to force
+        force_mask = jax.random.bernoulli(
+            hk.next_rng_key(), self._hint_teacher_forcing,
+            (batch_size,))
+      else:
+        force_mask = None
+      for hint in hints:
+        # Ground-truth hint at step `i`.
+        hint_data = jnp.asarray(hint.data)[i]
+        _, loc, typ = spec[hint.name]
+        if needs_noise:
+          if (typ == _Type.POINTER and
+              decoded_hint[hint.name].type_ == _Type.SOFT_POINTER):
+            # When using soft pointers, the decoded hints cannot be summarised
+            # as indices (as would happen in hard postprocessing), so we need
+            # to raise the ground-truth hint (potentially used for teacher
+            # forcing) to its one-hot version.
+            hint_data = hk.one_hot(hint_data, nb_nodes)
+            typ = _Type.SOFT_POINTER
+          hint_data = jnp.where(_expand_to(force_mask, hint_data),
+                                hint_data,
+                                decoded_hint[hint.name].data)
+        cur_hint.append(
+            probing.DataPoint(
+                name=hint.name, location=loc, type_=typ, data=hint_data))
+    hiddens, output_preds_cand, hint_preds, lstm_state = self._one_step_pred(
+        inputs, cur_hint, mp_state.hiddens,
+        batch_size, nb_nodes, mp_state.lstm_state,
+        spec, encs, decs, repred)
+    if first_step:
+      output_preds = output_preds_cand
+    else:
+      output_preds = {}
+      for outp in mp_state.output_preds:
+        # Keep the previous prediction for samples that have already finished.
+        is_not_done = _is_not_done_broadcast(lengths, i,
+                                             output_preds_cand[outp])
+        output_preds[outp] = is_not_done * output_preds_cand[outp] + (
+            1.0 - is_not_done) * mp_state.output_preds[outp]
+    new_mp_state = _MessagePassingScanState(  # pytype: disable=wrong-arg-types  # numpy-scalars
+        hint_preds=hint_preds,
+        output_preds=output_preds,
+        hiddens=hiddens,
+        lstm_state=lstm_state)
+    # Save memory by not stacking unnecessary fields
+    accum_mp_state = _MessagePassingScanState(  # pytype: disable=wrong-arg-types  # numpy-scalars
+        hint_preds=hint_preds if return_hints else None,
+        output_preds=output_preds if return_all_outputs else None,
+        hiddens=None, lstm_state=None)
+    # Complying to jax.scan, the first returned value is the state we carry over
+    # the second value is the output that will be stacked over steps.
+    return new_mp_state, accum_mp_state
+  def __call__(self, features_list: List[_Features], repred: bool,
+               algorithm_index: int,
+               return_hints: bool,
+               return_all_outputs: bool):
+    """Process one batch of data.
+    Args:
+      features_list: A list of _Features objects, each with the inputs, hints
+        and lengths for a batch of data corresponding to one algorithm.
+        The list should have either length 1, at train/evaluation time,
+        or length equal to the number of algorithms this Net is meant to
+        process, at initialization.
+      repred: False during training, when we have access to ground-truth hints.
+        True in validation/test mode, when we have to use our own
+        hint predictions.
+      algorithm_index: Which algorithm is being processed. It can be -1 at
+        initialisation (either because we are initialising the parameters of
+        the module or because we are initialising the message-passing state),
+        meaning that all algorithms should be processed, in which case
+        `features_list` should have length equal to the number of specs of
+        the Net. Otherwise, `algorithm_index` should be
+        between 0 and `length(self.spec) - 1`, meaning only one of the
+        algorithms will be processed, and `features_list` should have length 1.
+      return_hints: Whether to accumulate and return the predicted hints,
+        when they are decoded.
+      return_all_outputs: Whether to return the full sequence of outputs, or
+        just the last step's output.
+    Returns:
+      A 2-tuple with (output predictions, hint predictions)
+      for the selected algorithm.
+    """
+    if algorithm_index == -1:
+      algorithm_indices = range(len(features_list))
+    else:
+      algorithm_indices = [algorithm_index]
+    assert len(algorithm_indices) == len(features_list)
+    self.encoders, self.decoders = self._construct_encoders_decoders()
+    self.processor = self.processor_factory(self.hidden_dim)
+    # Optionally construct LSTM.
+    if self.use_lstm:
+      self.lstm = hk.LSTM(
+          hidden_size=self.hidden_dim,
+          name='processor_lstm')
+      lstm_init = self.lstm.initial_state
+    else:
+      self.lstm = None
+      lstm_init = lambda x: 0
+    # NOTE: the loop variable below rebinds the `algorithm_index` argument.
+    for algorithm_index, features in zip(algorithm_indices, features_list):
+      inputs = features.inputs
+      hints = features.hints
+      lengths = features.lengths
+      batch_size, nb_nodes = _data_dimensions(features)
+      # One step per hint transition, and at least one step overall.
+      nb_mp_steps = max(1, hints[0].data.shape[0] - 1)
+      hiddens = jnp.zeros((batch_size, nb_nodes, self.hidden_dim))
+      if self.use_lstm:
+        # The initial state is built flat over batch * nodes, then reshaped to
+        # (batch, nodes, -1).
+        lstm_state = lstm_init(batch_size * nb_nodes)
+        lstm_state = jax.tree_util.tree_map(
+            lambda x, b=batch_size, n=nb_nodes: jnp.reshape(x, [b, n, -1]),
+            lstm_state)
+      else:
+        lstm_state = None
+      mp_state = _MessagePassingScanState(  # pytype: disable=wrong-arg-types  # numpy-scalars
+          hint_preds=None, output_preds=None,
+          hiddens=hiddens, lstm_state=lstm_state)
+      # Do the first step outside of the scan because it has a different
+      # computation graph.
+      common_args = dict(
+          hints=hints,
+          repred=repred,
+          inputs=inputs,
+          batch_size=batch_size,
+          nb_nodes=nb_nodes,
+          lengths=lengths,
+          spec=self.spec[algorithm_index],
+          encs=self.encoders[algorithm_index],
+          decs=self.decoders[algorithm_index],
+          return_hints=return_hints,
+          return_all_outputs=return_all_outputs,
+          )
+      mp_state, lean_mp_state = self._msg_passing_step(
+          mp_state,
+          i=0,
+          first_step=True,
+          **common_args)
+      # Then scan through the rest.
+      scan_fn = functools.partial(
+          self._msg_passing_step,
+          first_step=False,
+          **common_args)
+      output_mp_state, accum_mp_state = hk.scan(
+          scan_fn,
+          mp_state,
+          jnp.arange(nb_mp_steps - 1) + 1,
+          length=nb_mp_steps - 1)
+    # We only return the last algorithm's output. That's because
+    # the output only matters when a single algorithm is processed; the case
+    # `algorithm_index==-1` (meaning all algorithms should be processed)
+    # is used only to init parameters.
+    # Prepend the first step's record to the records stacked by the scan.
+    accum_mp_state = jax.tree_util.tree_map(
+        lambda init, tail: jnp.concatenate([init[None], tail], axis=0),
+        lean_mp_state, accum_mp_state)
+    def invert(d):
+      """Dict of lists -> list of dicts."""
+      if d:
+        return [dict(zip(d, i)) for i in zip(*d.values())]
+    if return_all_outputs:
+      output_preds = {k: jnp.stack(v)
+                      for k, v in accum_mp_state.output_preds.items()}
+    else:
+      output_preds = output_mp_state.output_preds
+    # `invert` yields None when no hint predictions were accumulated.
+    hint_preds = invert(accum_mp_state.hint_preds)
+    return output_preds, hint_preds
+  def _construct_encoders_decoders(self):
+    """Constructs encoders and decoders, separate for each algorithm.
+    Returns:
+      A 2-tuple `(encoders, decoders)` of lists with one dict per algorithm
+      spec, each mapping a spec entry name to its encoder or decoder.
+    """
+    encoders_ = []
+    decoders_ = []
+    enc_algo_idx = None
+    for (algo_idx, spec) in enumerate(self.spec):
+      enc = {}
+      dec = {}
+      for name, (stage, loc, t) in spec.items():
+        if stage == _Stage.INPUT or (
+            stage == _Stage.HINT and self.encode_hints):
+          # Build input encoders.
+          if name == specs.ALGO_IDX_INPUT_NAME:
+            # This encoder is created once and reused by every algorithm.
+            if enc_algo_idx is None:
+              enc_algo_idx = [hk.Linear(self.hidden_dim,
+                                        name=f'{name}_enc_linear')]
+            enc[name] = enc_algo_idx
+          else:
+            enc[name] = encoders.construct_encoders(
+                stage, loc, t, hidden_dim=self.hidden_dim,
+                init=self.encoder_init,
+                name=f'algo_{algo_idx}_{name}')
+        if stage == _Stage.OUTPUT or (
+            stage == _Stage.HINT and self.decode_hints):
+          # Build output decoders.
+          dec[name] = decoders.construct_decoders(
+              loc, t, hidden_dim=self.hidden_dim,
+              nb_dims=self.nb_dims[algo_idx][name],
+              name=f'algo_{algo_idx}_{name}')
+      encoders_.append(enc)
+      decoders_.append(dec)
+    return encoders_, decoders_
+  def _one_step_pred(
+      self,
+      inputs: _Trajectory,
+      hints: _Trajectory,
+      hidden: _Array,
+      batch_size: int,
+      nb_nodes: int,
+      lstm_state: Optional[hk.LSTMState],
+      spec: _Spec,
+      encs: Dict[str, List[hk.Module]],
+      decs: Dict[str, Tuple[hk.Module]],
+      repred: bool,
+  ):
+    """Generates one-step predictions.
+    Encodes inputs (and hints, if `encode_hints`), applies the processor
+    `nb_msg_passing_steps` times, optionally applies dropout and the LSTM,
+    and decodes hint and output predictions.
+    Returns:
+      A 4-tuple `(nxt_hidden, output_preds, hint_preds, nxt_lstm_state)`.
+    """
+    # Initialise empty node/edge/graph features and adjacency matrix.
+    node_fts = jnp.zeros((batch_size, nb_nodes, self.hidden_dim))
+    edge_fts = jnp.zeros((batch_size, nb_nodes, nb_nodes, self.hidden_dim))
+    graph_fts = jnp.zeros((batch_size, self.hidden_dim))
+    # The adjacency matrix starts as the identity (self-loops only).
+    adj_mat = jnp.repeat(
+        jnp.expand_dims(jnp.eye(nb_nodes), 0), batch_size, axis=0)
+    # ENCODE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # Encode node/edge/graph features from inputs and (optionally) hints.
+    trajectories = [inputs]
+    if self.encode_hints:
+      trajectories.append(hints)
+    for trajectory in trajectories:
+      for dp in trajectory:
+        try:
+          dp = encoders.preprocess(dp, nb_nodes)
+          assert dp.type_ != _Type.SOFT_POINTER
+          adj_mat = encoders.accum_adj_mat(dp, adj_mat)
+          encoder = encs[dp.name]
+          edge_fts = encoders.accum_edge_fts(encoder, dp, edge_fts)
+          node_fts = encoders.accum_node_fts(encoder, dp, node_fts)
+          graph_fts = encoders.accum_graph_fts(encoder, dp, graph_fts)
+        except Exception as e:
+          raise Exception(f'Failed to process {dp}') from e
+    # PROCESS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    nxt_hidden = hidden
+    # NOTE(review): `nxt_edge` is only bound inside this loop, so
+    # `nb_msg_passing_steps` must be at least 1.
+    for _ in range(self.nb_msg_passing_steps):
+      nxt_hidden, nxt_edge = self.processor(
+          node_fts,
+          edge_fts,
+          graph_fts,
+          adj_mat,
+          nxt_hidden,
+          batch_size=batch_size,
+          nb_nodes=nb_nodes,
+      )
+    if not repred:      # dropout only on training
+      nxt_hidden = hk.dropout(hk.next_rng_key(), self._dropout_prob, nxt_hidden)
+    if self.use_lstm:
+      # lstm doesn't accept multiple batch dimensions (in our case, batch and
+      # nodes), so we vmap over the (first) batch dimension.
+      nxt_hidden, nxt_lstm_state = jax.vmap(self.lstm)(nxt_hidden, lstm_state)
+    else:
+      nxt_lstm_state = None
+    # Decoders see the encoded node features plus the old and new hidden state.
+    h_t = jnp.concatenate([node_fts, hidden, nxt_hidden], axis=-1)
+    if nxt_edge is not None:
+      e_t = jnp.concatenate([edge_fts, nxt_edge], axis=-1)
+    else:
+      e_t = edge_fts
+    # DECODE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # Decode features and (optionally) hints.
+    hint_preds, output_preds = decoders.decode_fts(
+        decoders=decs,
+        spec=spec,
+        h_t=h_t,
+        adj_mat=adj_mat,
+        edge_fts=e_t,
+        graph_fts=graph_fts,
+        inf_bias=self.processor.inf_bias,
+        inf_bias_edge=self.processor.inf_bias_edge,
+        repred=repred,
+    )
+    return nxt_hidden, output_preds, hint_preds, nxt_lstm_state
+class NetChunked(Net):
+  """A Net that will process time-chunked data instead of full samples."""
+  def _msg_passing_step(self,
+                        mp_state: MessagePassingStateChunked,
+                        xs,
+                        repred: bool,
+                        init_mp_state: bool,
+                        batch_size: int,
+                        nb_nodes: int,
+                        spec: _Spec,
+                        encs: Dict[str, List[hk.Module]],
+                        decs: Dict[str, Tuple[hk.Module]],
+                        ):
+    """Perform one message passing step.
+    This function is unrolled along the time axis to process a data chunk.
+    Args:
+      mp_state: message-passing state. Includes the inputs, hints,
+        beginning-of-sample markers, hint predictions, hidden and lstm state
+        to be used for prediction in the current step.
+      xs: A 3-tuple with the next timestep's inputs, hints, and
+        beginning-of-sample markers. These will replace the contents of
+        the `mp_state` at the output, in readiness for the next unroll step of
+        the chunk (or the first step of the next chunk). Besides, the next
+        timestep's hints are necessary to compute diffs when `decode_diffs`
+        is True. NOTE(review): no `decode_diffs` option is visible in this
+        class; this sentence may be stale.
+      repred: False during training, when we have access to ground-truth hints.
+        True in validation/test mode, when we have to use our own
+        hint predictions.
+      init_mp_state: Indicates if we are calling the method just to initialise
+        the message-passing state, before the beginning of training or
+        validation.
+      batch_size: Size of batch dimension.
+      nb_nodes: Number of nodes in graph.
+      spec: The spec of the algorithm being processed.
+      encs: encoders for the algorithm being processed.
+      decs: decoders for the algorithm being processed.
+    Returns:
+      A 2-tuple with the next mp_state and an output consisting of
+      hint predictions and output predictions.
+    """
+    def _as_prediction_data(hint):
+      if hint.type_ == _Type.POINTER:
+        return hk.one_hot(hint.data, nb_nodes)
+      return hint.data
+    nxt_inputs, nxt_hints, nxt_is_first = xs
+    inputs = mp_state.inputs
+    is_first = mp_state.is_first
+    hints = mp_state.hints
+    if init_mp_state:
+      # No predictions exist yet: stand in the ground-truth hints for them.
+      prev_hint_preds = {h.name: _as_prediction_data(h) for h in hints}
+      hints_for_pred = hints
+    else:
+      prev_hint_preds = mp_state.hint_preds
+      if self.decode_hints:
+        # Per-example mask: True where the ground-truth hint is fed in.
+        if repred:
+          force_mask = jnp.zeros(batch_size, dtype=bool)
+        elif self._hint_teacher_forcing == 1.0:
+          force_mask = jnp.ones(batch_size, dtype=bool)
+        else:
+          force_mask = jax.random.bernoulli(
+              hk.next_rng_key(), self._hint_teacher_forcing,
+              (batch_size,))
+        assert self._hint_repred_mode in ['soft', 'hard', 'hard_on_eval']
+        hard_postprocess = (
+            self._hint_repred_mode == 'hard' or
+            (self._hint_repred_mode == 'hard_on_eval' and repred))
+        decoded_hints = decoders.postprocess(spec,
+                                             prev_hint_preds,
+                                             sinkhorn_temperature=0.1,
+                                             sinkhorn_steps=25,
+                                             hard=hard_postprocess)
+        hints_for_pred = []
+        for h in hints:
+          typ = h.type_
+          hint_data = h.data
+          if (typ == _Type.POINTER and
+              decoded_hints[h.name].type_ == _Type.SOFT_POINTER):
+            # Soft decoded pointers are not indices, so lift the ground truth
+            # to one-hot form before mixing the two.
+            hint_data = hk.one_hot(hint_data, nb_nodes)
+            typ = _Type.SOFT_POINTER
+          # At the first step of a sample the ground-truth hint is always used.
+          hints_for_pred.append(probing.DataPoint(
+              name=h.name, location=h.location, type_=typ,
+              data=jnp.where(_expand_to(is_first | force_mask, hint_data),
+                             hint_data, decoded_hints[h.name].data)))
+      else:
+        hints_for_pred = hints
+    # Reset the recurrent state wherever a new sample begins.
+    hiddens = jnp.where(is_first[..., None, None], 0.0, mp_state.hiddens)
+    if self.use_lstm:
+      lstm_state = jax.tree_util.tree_map(
+          lambda x: jnp.where(is_first[..., None, None], 0.0, x),
+          mp_state.lstm_state)
+    else:
+      lstm_state = None
+    hiddens, output_preds, hint_preds, lstm_state = self._one_step_pred(
+        inputs, hints_for_pred, hiddens,
+        batch_size, nb_nodes, lstm_state,
+        spec, encs, decs, repred)
+    # The next step's inputs, hints and markers are stored in the new state.
+    new_mp_state = MessagePassingStateChunked(  # pytype: disable=wrong-arg-types  # numpy-scalars
+        hiddens=hiddens, lstm_state=lstm_state, hint_preds=hint_preds,
+        inputs=nxt_inputs, hints=nxt_hints, is_first=nxt_is_first)
+    mp_output = _MessagePassingOutputChunked(  # pytype: disable=wrong-arg-types  # numpy-scalars
+        hint_preds=hint_preds,
+        output_preds=output_preds)
+    return new_mp_state, mp_output
+  def __call__(self, features_list: List[_FeaturesChunked],
+               mp_state_list: List[MessagePassingStateChunked],
+               repred: bool, init_mp_state: bool,
+               algorithm_index: int):
+    """Process one chunk of data.
+    Args:
+      features_list: A list of _FeaturesChunked objects, each with the
+        inputs, hints and beginning- and end-of-sample markers for
+        a chunk (i.e., fixed time length) of data corresponding to one
+        algorithm. All features are expected
+        to have dimensions chunk_length x batch_size x ...
+        The list should have either length 1, at train/evaluation time,
+        or length equal to the number of algorithms this Net is meant to
+        process, at initialization.
+      mp_state_list: list of message-passing states. Each message-passing state
+        includes the inputs, hints, beginning-of-sample markers,
+        hint prediction, hidden and lstm state from the end of the previous
+        chunk, for one algorithm. The length of the list should be the same
+        as the length of `features_list`.
+      repred: False during training, when we have access to ground-truth hints.
+        True in validation/test mode, when we have to use our own hint
+        predictions.
+      init_mp_state: Indicates if we are calling the network just to initialise
+        the message-passing state, before the beginning of training or
+        validation. If True, `algorithm_index` (see below) must be -1 in order
+        to initialize the message-passing state of all algorithms.
+      algorithm_index: Which algorithm is being processed. It can be -1 at
+        initialisation (either because we are initialising the parameters of
+        the module or because we are initialising the message-passing state),
+        meaning that all algorithms should be processed, in which case
+        `features_list` and `mp_state_list` should have length equal to the
+        number of specs of the Net. Otherwise, `algorithm_index` should be
+        between 0 and `length(self.spec) - 1`, meaning only one of the
+        algorithms will be processed, and `features_list` and `mp_state_list`
+        should have length 1.
+    Returns:
+      A 2-tuple consisting of:
+      - A 2-tuple with (output predictions, hint predictions)
+        for the selected algorithm. Each of these has
+        chunk_length x batch_size x ... data, where the first time
+        slice contains outputs for the mp_state
+        that was passed as input, and the last time slice contains outputs
+        for the next-to-last slice of the input features. The outputs that
+        correspond to the final time slice of the input features will be
+        calculated when the next chunk is processed, using the data in the
+        mp_state returned here (see below). If `init_mp_state` is True,
+        we return None instead of the 2-tuple.
+      - The mp_state (message-passing state) for the next chunk of data
+        of the selected algorithm. If `init_mp_state` is True, we return
+        initial mp states for all the algorithms.
+    Raises:
+      AssertionError: If `init_mp_state` is True while `algorithm_index` is
+        not -1, or if the lengths of `features_list` / `mp_state_list` do not
+        match the number of selected algorithms.
+    """
+    # Select the algorithms to run: every entry of `features_list` when
+    # `algorithm_index` is -1, otherwise just the requested one.
+    if algorithm_index == -1:
+      algorithm_indices = range(len(features_list))
+    else:
+      algorithm_indices = [algorithm_index]
+      assert not init_mp_state  # init state only allowed with all algorithms
+    assert len(algorithm_indices) == len(features_list)
+    assert len(algorithm_indices) == len(mp_state_list)
+    # Encoders/decoders and the processor are (re)assigned on every call.
+    self.encoders, self.decoders = self._construct_encoders_decoders()
+    self.processor = self.processor_factory(self.hidden_dim)
+    # Optionally construct LSTM.
+    if self.use_lstm:
+      self.lstm = hk.LSTM(
+          hidden_size=self.hidden_dim,
+          name='processor_lstm')
+      lstm_init = self.lstm.initial_state
+    else:
+      self.lstm = None
+      # Placeholder only: `lstm_init` is called below solely when
+      # `self.use_lstm` is True.
+      lstm_init = lambda x: 0
+    if init_mp_state:
+      # Initialisation mode: build a starting message-passing state for each
+      # algorithm and return them all; no predictions are returned.
+      output_mp_states = []
+      # NOTE: the loop variable shadows the `algorithm_index` argument.
+      for algorithm_index, features, mp_state in zip(
+          algorithm_indices, features_list, mp_state_list):
+        inputs = features.inputs
+        hints = features.hints
+        batch_size, nb_nodes = _data_dimensions_chunked(features)
+        if self.use_lstm:
+          # The LSTM state is created for batch_size * nb_nodes rows and then
+          # reshaped to [batch_size, nb_nodes, -1].
+          lstm_state = lstm_init(batch_size * nb_nodes)
+          lstm_state = jax.tree_util.tree_map(
+              lambda x, b=batch_size, n=nb_nodes: jnp.reshape(x, [b, n, -1]),
+              lstm_state)
+          mp_state.lstm_state = lstm_state
+        # Keep only index 0 along the leading axis, i.e. the first time slice
+        # given the chunk_length x batch_size x ... layout documented above.
+        mp_state.inputs = jax.tree_util.tree_map(lambda x: x[0], inputs)
+        mp_state.hints = jax.tree_util.tree_map(lambda x: x[0], hints)
+        mp_state.is_first = jnp.zeros(batch_size, dtype=int)
+        mp_state.hiddens = jnp.zeros((batch_size, nb_nodes, self.hidden_dim))
+        next_is_first = jnp.ones(batch_size, dtype=int)
+        # Run one step on the first slice; the step's second return value is
+        # discarded and only the resulting state is kept.
+        # NOTE(review): the resulting state presumably carries
+        # `next_is_first` as its `is_first` marker -- confirm against
+        # `_msg_passing_step`.
+        mp_state, _ = self._msg_passing_step(
+            mp_state,
+            (mp_state.inputs, mp_state.hints, next_is_first),
+            repred=repred,
+            init_mp_state=True,
+            batch_size=batch_size,
+            nb_nodes=nb_nodes,
+            spec=self.spec[algorithm_index],
+            encs=self.encoders[algorithm_index],
+            decs=self.decoders[algorithm_index],
+            )
+        output_mp_states.append(mp_state)
+      return None, output_mp_states
+    # Regular mode: scan the message-passing step over the leading axis of
+    # (inputs, hints, is_first) for each selected algorithm.
+    # NOTE: as above, the loop variable shadows the `algorithm_index` argument.
+    for algorithm_index, features, mp_state in zip(
+        algorithm_indices, features_list, mp_state_list):
+      inputs = features.inputs
+      hints = features.hints
+      is_first = features.is_first
+      batch_size, nb_nodes = _data_dimensions_chunked(features)
+      scan_fn = functools.partial(
+          self._msg_passing_step,
+          repred=repred,
+          init_mp_state=False,
+          batch_size=batch_size,
+          nb_nodes=nb_nodes,
+          spec=self.spec[algorithm_index],
+          encs=self.encoders[algorithm_index],
+          decs=self.decoders[algorithm_index],
+          )
+      mp_state, scan_output = hk.scan(
+          scan_fn,
+          mp_state,
+          (inputs, hints, is_first),
+      )
+    # We only return the last algorithm's output and state. That's because
+    # the output only matters when a single algorithm is processed; the case
+    # `algorithm_index==-1` (meaning all algorithms should be processed)
+    # is used only to init parameters.
+    # NOTE(review): `scan_output` and `mp_state` are bound only inside the
+    # loop above, so an empty `features_list` would make this return fail.
+    return (scan_output.output_preds, scan_output.hint_preds), mp_state
+def _data_dimensions(features: _Features) -> Tuple[int, int]:
+  """Returns (batch_size, nb_nodes)."""
+  for inp in features.inputs:
+    if inp.location in [_Location.NODE, _Location.EDGE]:
+      return inp.data.shape[:2]
+  assert False
+def _data_dimensions_chunked(features: _FeaturesChunked) -> Tuple[int, int]:
+  """Returns (batch_size, nb_nodes)."""
+  for inp in features.inputs:
+    if inp.location in [_Location.NODE, _Location.EDGE]:
+      return inp.data.shape[1:3]
+  assert False
+def _expand_to(x: _Array, y: _Array) -> _Array:
+  while len(y.shape) > len(x.shape):
+    x = jnp.expand_dims(x, -1)
+  return x
+def _is_not_done_broadcast(lengths, i, tensor):
+  is_not_done = (lengths > i + 1) * 1.0
+  while len(is_not_done.shape) < len(tensor.shape):  # pytype: disable=attribute-error  # numpy-scalars
+    is_not_done = jnp.expand_dims(is_not_done, -1)
+  return is_not_done

benchmarks/CLRS/env/probing.py ADDED Viewed

	@@ -0,0 +1,351 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Probing utilities.
+The dataflow for an algorithm is represented by `(stage, loc, type, data)`
+"probes" that are valid under that algorithm's spec (see `specs.py`).
+When constructing probes, it is convenient to represent these fields in a nested
+format (`ProbesDict`) to facilitate efficient context-based look-up.
+"""
+import functools
+from typing import Dict, List, Tuple, Union
+import attr
+from clrs._src import specs
+import jax
+import jax.numpy as jnp
+import numpy as np
+import tensorflow as tf
+_Location = specs.Location
+_Stage = specs.Stage
+_Type = specs.Type
+_OutputClass = specs.OutputClass
+_Array = np.ndarray
+_Data = Union[_Array, List[_Array]]
+_DataOrType = Union[_Data, str]
+ProbesDict = Dict[
+    str, Dict[str, Dict[str, Dict[str, _DataOrType]]]]
+def _convert_to_str(element):
+  if isinstance(element, tf.Tensor):
+    return element.numpy().decode('utf-8')
+  elif isinstance(element, (np.ndarray, bytes)):
+    return element.decode('utf-8')
+  else:
+    return element
+# First anotation makes this object jax.jit/pmap friendly, second one makes this
+# tf.data.Datasets friendly.
+@jax.tree_util.register_pytree_node_class
+@attr.define
+class DataPoint:
+  """Describes a data point."""
+  _name: str
+  _location: str
+  _type_: str
+  data: _Array
+  @property
+  def name(self):
+    return _convert_to_str(self._name)
+  @property
+  def location(self):
+    return _convert_to_str(self._location)
+  @property
+  def type_(self):
+    return _convert_to_str(self._type_)
+  def __repr__(self):
+    s = f'DataPoint(name="{self.name}",\tlocation={self.location},\t'
+    return s + f'type={self.type_},\tdata=Array{self.data.shape})'
+  def tree_flatten(self):
+    data = (self.data,)
+    meta = (self.name, self.location, self.type_)
+    return data, meta
+  @classmethod
+  def tree_unflatten(cls, meta, data):
+    name, location, type_ = meta
+    subdata, = data
+    return DataPoint(name, location, type_, subdata)
+class ProbeError(Exception):
+  pass
+def initialize(spec: specs.Spec) -> ProbesDict:
+  """Initializes an empty `ProbesDict` corresponding with the provided spec."""
+  probes = dict()
+  for stage in [_Stage.INPUT, _Stage.OUTPUT, _Stage.HINT]:
+    probes[stage] = {}
+    for loc in [_Location.NODE, _Location.EDGE, _Location.GRAPH]:
+      probes[stage][loc] = {}
+  for name in spec:
+    stage, loc, t = spec[name]
+    probes[stage][loc][name] = {}
+    probes[stage][loc][name]['data'] = []
+    probes[stage][loc][name]['type_'] = t
+  # Pytype thinks initialize() returns a ProbesDict with a str for all final
+  # values instead of _DataOrType.
+  return probes  # pytype: disable=bad-return-type
+def push(probes: ProbesDict, stage: str, next_probe):
+  """Pushes a probe into an existing `ProbesDict`."""
+  for loc in [_Location.NODE, _Location.EDGE, _Location.GRAPH]:
+    for name in probes[stage][loc]:
+      if name not in next_probe:
+        raise ProbeError(f'Missing probe for {name}.')
+      if isinstance(probes[stage][loc][name]['data'], _Array):
+        raise ProbeError('Attemping to push to finalized `ProbesDict`.')
+      # Pytype thinks initialize() returns a ProbesDict with a str for all final
+      # values instead of _DataOrType.
+      probes[stage][loc][name]['data'].append(next_probe[name])  # pytype: disable=attribute-error
+def finalize(probes: ProbesDict):
+  """Finalizes a `ProbesDict` by stacking/squeezing `data` field."""
+  for stage in [_Stage.INPUT, _Stage.OUTPUT, _Stage.HINT]:
+    for loc in [_Location.NODE, _Location.EDGE, _Location.GRAPH]:
+      for name in probes[stage][loc]:
+        if isinstance(probes[stage][loc][name]['data'], _Array):
+          raise ProbeError('Attemping to re-finalize a finalized `ProbesDict`.')
+        if stage == _Stage.HINT:
+          # Hints are provided for each timestep. Stack them here.
+          probes[stage][loc][name]['data'] = np.stack(
+              probes[stage][loc][name]['data'])
+        else:
+          # Only one instance of input/output exist. Remove leading axis.
+          probes[stage][loc][name]['data'] = np.squeeze(
+              np.array(probes[stage][loc][name]['data']))
+def split_stages(
+    probes: ProbesDict,
+    spec: specs.Spec,
+) -> Tuple[List[DataPoint], List[DataPoint], List[DataPoint]]:
+  """Splits contents of `ProbesDict` into `DataPoint`s by stage."""
+  inputs = []
+  outputs = []
+  hints = []
+  for name in spec:
+    stage, loc, t = spec[name]
+    if stage not in probes:
+      raise ProbeError(f'Missing stage {stage}.')
+    if loc not in probes[stage]:
+      raise ProbeError(f'Missing location {loc}.')
+    if name not in probes[stage][loc]:
+      raise ProbeError(f'Missing probe {name}.')
+    if 'type_' not in probes[stage][loc][name]:
+      raise ProbeError(f'Probe {name} missing attribute `type_`.')
+    if 'data' not in probes[stage][loc][name]:
+      raise ProbeError(f'Probe {name} missing attribute `data`.')
+    if t != probes[stage][loc][name]['type_']:
+      raise ProbeError(f'Probe {name} of incorrect type {t}.')
+    data = probes[stage][loc][name]['data']
+    if not isinstance(probes[stage][loc][name]['data'], _Array):
+      raise ProbeError((f'Invalid `data` for probe "{name}". ' +
+                        'Did you forget to call `probing.finalize`?'))
+    if t in [_Type.MASK, _Type.MASK_ONE, _Type.CATEGORICAL]:
+      # pytype: disable=attribute-error
+      if not ((data == 0) | (data == 1) | (data == -1)).all():
+        raise ProbeError(f'0|1|-1 `data` for probe "{name}"')
+      # pytype: enable=attribute-error
+      if t in [_Type.MASK_ONE, _Type.CATEGORICAL
+              ] and not np.all(np.sum(np.abs(data), -1) == 1):
+        raise ProbeError(f'Expected one-hot `data` for probe "{name}"')
+    dim_to_expand = 1 if stage == _Stage.HINT else 0
+    data_point = DataPoint(name=name, location=loc, type_=t,
+                           data=np.expand_dims(data, dim_to_expand))
+    if stage == _Stage.INPUT:
+      inputs.append(data_point)
+    elif stage == _Stage.OUTPUT:
+      outputs.append(data_point)
+    else:
+      hints.append(data_point)
+  return inputs, outputs, hints
+# pylint: disable=invalid-name
+def array(A_pos: np.ndarray) -> np.ndarray:
+  """Constructs an `array` probe."""
+  probe = np.arange(A_pos.shape[0])
+  for i in range(1, A_pos.shape[0]):
+    probe[A_pos[i]] = A_pos[i - 1]
+  return probe
+def array_cat(A: np.ndarray, n: int) -> np.ndarray:
+  """Constructs an `array_cat` probe."""
+  assert n > 0
+  probe = np.zeros((A.shape[0], n))
+  for i in range(A.shape[0]):
+    probe[i, A[i]] = 1
+  return probe
+def heap(A_pos: np.ndarray, heap_size: int) -> np.ndarray:
+  """Constructs a `heap` probe."""
+  assert heap_size > 0
+  probe = np.arange(A_pos.shape[0])
+  for i in range(1, heap_size):
+    probe[A_pos[i]] = A_pos[(i - 1) // 2]
+  return probe
+def graph(A: np.ndarray) -> np.ndarray:
+  """Constructs a `graph` probe."""
+  probe = (A != 0) * 1.0
+  probe = ((A + np.eye(A.shape[0])) != 0) * 1.0
+  return probe
+def mask_one(i: int, n: int) -> np.ndarray:
+  """Constructs a `mask_one` probe."""
+  assert n > i
+  probe = np.zeros(n)
+  probe[i] = 1
+  return probe
+def strings_id(T_pos: np.ndarray, P_pos: np.ndarray) -> np.ndarray:
+  """Constructs a `strings_id` probe."""
+  probe_T = np.zeros(T_pos.shape[0])
+  probe_P = np.ones(P_pos.shape[0])
+  return np.concatenate([probe_T, probe_P])
+def strings_pair(pair_probe: np.ndarray) -> np.ndarray:
+  """Constructs a `strings_pair` probe."""
+  n = pair_probe.shape[0]
+  m = pair_probe.shape[1]
+  probe_ret = np.zeros((n + m, n + m))
+  for i in range(0, n):
+    for j in range(0, m):
+      probe_ret[i, j + n] = pair_probe[i, j]
+  return probe_ret
+def strings_pair_cat(pair_probe: np.ndarray, nb_classes: int) -> np.ndarray:
+  """Constructs a `strings_pair_cat` probe."""
+  assert nb_classes > 0
+  n = pair_probe.shape[0]
+  m = pair_probe.shape[1]
+  # Add an extra class for 'this cell left blank.'
+  probe_ret = np.zeros((n + m, n + m, nb_classes + 1))
+  for i in range(0, n):
+    for j in range(0, m):
+      probe_ret[i, j + n, int(pair_probe[i, j])] = _OutputClass.POSITIVE
+  # Fill the blank cells.
+  for i_1 in range(0, n):
+    for i_2 in range(0, n):
+      probe_ret[i_1, i_2, nb_classes] = _OutputClass.MASKED
+  for j_1 in range(0, m):
+    for x in range(0, n + m):
+      probe_ret[j_1 + n, x, nb_classes] = _OutputClass.MASKED
+  return probe_ret
+def strings_pi(T_pos: np.ndarray, P_pos: np.ndarray,
+               pi: np.ndarray) -> np.ndarray:
+  """Constructs a `strings_pi` probe."""
+  probe = np.arange(T_pos.shape[0] + P_pos.shape[0])
+  for j in range(P_pos.shape[0]):
+    probe[T_pos.shape[0] + P_pos[j]] = T_pos.shape[0] + pi[P_pos[j]]
+  return probe
+def strings_pos(T_pos: np.ndarray, P_pos: np.ndarray) -> np.ndarray:
+  """Constructs a `strings_pos` probe."""
+  probe_T = np.copy(T_pos) * 1.0 / T_pos.shape[0]
+  probe_P = np.copy(P_pos) * 1.0 / P_pos.shape[0]
+  return np.concatenate([probe_T, probe_P])
+def strings_pred(T_pos: np.ndarray, P_pos: np.ndarray) -> np.ndarray:
+  """Constructs a `strings_pred` probe."""
+  probe = np.arange(T_pos.shape[0] + P_pos.shape[0])
+  for i in range(1, T_pos.shape[0]):
+    probe[T_pos[i]] = T_pos[i - 1]
+  for j in range(1, P_pos.shape[0]):
+    probe[T_pos.shape[0] + P_pos[j]] = T_pos.shape[0] + P_pos[j - 1]
+  return probe
+@functools.partial(jnp.vectorize, signature='(n)->(n,n),(n)')
+def predecessor_to_cyclic_predecessor_and_first(
+    pointers: jnp.ndarray) -> Tuple[jnp.ndarray, jnp.ndarray]:
+  """Converts predecessor pointers to cyclic predecessor + first node mask.
+  This function assumes that the pointers represent a linear order of the nodes
+  (akin to a linked list), where each node points to its predecessor and the
+  first node points to itself. It returns the same pointers, except that
+  the first node points to the last, and a mask_one marking the first node.
+  Example:
+  ```
+  pointers = [2, 1, 1]
+  P = [[0, 0, 1],
+       [1, 0, 0],
+       [0, 1, 0]],
+  M = [0, 1, 0]
+  ```
+  Args:
+    pointers: array of shape [N] containing pointers. The pointers are assumed
+      to describe a linear order such that `pointers[i]` is the predecessor
+      of node `i`.
+  Returns:
+    Permutation pointers `P` of shape [N] and one-hot vector `M` of shape [N].
+  """
+  nb_nodes = pointers.shape[-1]
+  pointers_one_hot = jax.nn.one_hot(pointers, nb_nodes)
+  # Find the index of the last node: it's the node that no other node points to.
+  last = pointers_one_hot.sum(-2).argmin()
+  # Find the first node: should be the only one pointing to itself.
+  first = pointers_one_hot.diagonal().argmax()
+  mask = jax.nn.one_hot(first, nb_nodes)
+  pointers_one_hot += mask[..., None] * jax.nn.one_hot(last, nb_nodes)
+  pointers_one_hot -= mask[..., None] * mask
+  return pointers_one_hot, mask

benchmarks/CLRS/env/probing_test.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `probing.py`."""
+from absl.testing import absltest
+from clrs._src import probing
+import jax.numpy as jnp
+import numpy as np
+# pylint: disable=invalid-name
+class ProbingTest(absltest.TestCase):
+  """Checks each probe constructor in `probing` against a fixed example."""
+  def test_array(self):
+    """`array` stores, at each listed position, the position listed before it."""
+    A_pos = np.array([1, 2, 0, 4, 3])
+    expected = np.array([2, 1, 1, 4, 0])
+    out = probing.array(A_pos)
+    np.testing.assert_array_equal(expected, out)
+  def test_array_cat(self):
+    """`array_cat` one-hot encodes each entry over the given class count."""
+    A = np.array([2, 1, 0, 1, 1])
+    expected = np.array([
+        [0, 0, 1],
+        [0, 1, 0],
+        [1, 0, 0],
+        [0, 1, 0],
+        [0, 1, 0]
+    ])
+    out = probing.array_cat(A, 3)
+    np.testing.assert_array_equal(expected, out)
+  def test_heap(self):
+    """`heap` only rewrites entries for the first `heap_size` positions."""
+    A_pos = np.array([1, 3, 5, 0, 7, 4, 2, 6])
+    expected = np.array([3, 1, 2, 1, 5, 1, 6, 3])
+    out = probing.heap(A_pos, heap_size=6)
+    np.testing.assert_array_equal(expected, out)
+  def test_graph(self):
+    """`graph` marks non-zero entries and the whole diagonal with 1."""
+    G = np.array([
+        [0.0, 7.0, -1.0, -3.9, 7.452],
+        [0.0, 0.0, 133.0, 0.0, 9.3],
+        [0.5, 0.1, 0.22, 0.55, 0.666],
+        [7.0, 6.1, 0.2, 0.0, 0.0],
+        [0.0, 3.0, 0.0, 1.0, 0.5]
+    ])
+    expected = np.array([
+        [1.0, 1.0, 1.0, 1.0, 1.0],
+        [0.0, 1.0, 1.0, 0.0, 1.0],
+        [1.0, 1.0, 1.0, 1.0, 1.0],
+        [1.0, 1.0, 1.0, 1.0, 0.0],
+        [0.0, 1.0, 0.0, 1.0, 1.0]
+    ])
+    out = probing.graph(G)
+    np.testing.assert_array_equal(expected, out)
+  def test_mask_one(self):
+    """`mask_one` returns a one-hot vector of the requested length."""
+    expected = np.array([0, 0, 0, 1, 0])
+    out = probing.mask_one(3, 5)
+    np.testing.assert_array_equal(expected, out)
+  def test_strings_id(self):
+    """`strings_id` yields 0 for each text entry and 1 for each pattern entry."""
+    T_pos = np.array([0, 1, 2, 3, 4])
+    P_pos = np.array([0, 1, 2])
+    expected = np.array([0, 0, 0, 0, 0, 1, 1, 1])
+    out = probing.strings_id(T_pos, P_pos)
+    np.testing.assert_array_equal(expected, out)
+  def test_strings_pair(self):
+    """`strings_pair` places the input in the top-right block of a zero matrix."""
+    pair_probe = np.array([
+        [0.5, 3.1, 9.1, 7.3],
+        [1.0, 0.0, 8.0, 9.3],
+        [0.1, 5.0, 0.0, 1.2]
+    ])
+    expected = np.array([
+        [0.0, 0.0, 0.0, 0.5, 3.1, 9.1, 7.3],
+        [0.0, 0.0, 0.0, 1.0, 0.0, 8.0, 9.3],
+        [0.0, 0.0, 0.0, 0.1, 5.0, 0.0, 1.2],
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    ])
+    out = probing.strings_pair(pair_probe)
+    np.testing.assert_equal(expected, out)
+  def test_strings_pair_cat(self):
+    """`strings_pair_cat` one-hot encodes the block and masks all other cells."""
+    pair_probe = np.array([
+        [0, 2, 1],
+        [2, 2, 0]
+    ])
+    # The last class (index 3) is the extra "blank" class; it is -1 in every
+    # cell outside the top-right block.
+    expected = np.array([
+        [
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [1, 0, 0, 0],
+            [0, 0, 1, 0],
+            [0, 1, 0, 0],
+        ],
+        [
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 1, 0],
+            [0, 0, 1, 0],
+            [1, 0, 0, 0],
+        ],
+        [
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+        ],
+        [
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+        ],
+        [
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+            [0, 0, 0, -1],
+        ],
+    ])
+    out = probing.strings_pair_cat(pair_probe, 3)
+    np.testing.assert_equal(expected, out)
+  def test_strings_pi(self):
+    """`strings_pi` leaves the text part alone and offsets `pi` in the pattern."""
+    T_pos = np.array([0, 1, 2, 3, 4, 5])
+    P_pos = np.array([0, 1, 2, 3])
+    pi = np.array([3, 1, 0, 2])
+    expected = np.array(
+        [0, 1, 2, 3, 4, 5, 9, 7, 6, 8]
+    )
+    out = probing.strings_pi(T_pos, P_pos, pi)
+    np.testing.assert_array_equal(expected, out)
+  def test_strings_pos(self):
+    """`strings_pos` divides each position array by its own length."""
+    T_pos = np.array([0, 1, 2, 3, 4])
+    P_pos = np.array([0, 1, 2, 3])
+    expected = np.array(
+        [0.0, 0.2, 0.4, 0.6, 0.8,
+         0.0, 0.25, 0.5, 0.75]
+    )
+    out = probing.strings_pos(T_pos, P_pos)
+    np.testing.assert_array_equal(expected, out)
+  def test_strings_pred(self):
+    """`strings_pred` builds predecessor pointers for text and pattern parts."""
+    T_pos = np.array([0, 1, 2, 3, 4])
+    P_pos = np.array([0, 1, 2])
+    expected = np.array([0, 0, 1, 2, 3, 5, 5, 6])
+    out = probing.strings_pred(T_pos, P_pos)
+    np.testing.assert_array_equal(expected, out)
+class PermutationsTest(absltest.TestCase):
+  """Tests for `probing.predecessor_to_cyclic_predecessor_and_first`."""
+  def test_pointers_to_permutation(self):
+    """Checks the one-hot permutation and first-node mask on a 3-node input."""
+    pointers = jnp.array([2, 1, 1])
+    perm, first = probing.predecessor_to_cyclic_predecessor_and_first(pointers)
+    np.testing.assert_array_equal(
+        perm, np.array([[0, 0, 1], [1, 0, 0], [0, 1, 0]]))
+    np.testing.assert_array_equal(first, np.array([0, 1, 0]))
+  def test_pointers_to_permutation_already_sorted(self):
+    """Checks an already ordered input: the result is a rolled identity."""
+    pointers = jnp.array([0, 0, 1, 2, 3, 4])
+    perm, first = probing.predecessor_to_cyclic_predecessor_and_first(pointers)
+    np.testing.assert_array_equal(perm, np.roll(np.eye(6), 1, 0))
+    np.testing.assert_array_equal(first, np.eye(6)[0])
+# Run the absltest runner when this file is executed as a script.
+if __name__ == "__main__":
+  absltest.main()

benchmarks/CLRS/env/processors.py ADDED Viewed

	@@ -0,0 +1,856 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""JAX implementation of baseline processor networks."""
+import abc
+from typing import Any, Callable, List, Optional, Tuple
+import chex
+import haiku as hk
+import jax
+import jax.numpy as jnp
+import numpy as np
+_Array = chex.Array
+_Fn = Callable[..., Any]
+BIG_NUMBER = 1e6
+PROCESSOR_TAG = 'clrs_processor'
+class Processor(hk.Module):
+  """Processor abstract base class."""
+  def __init__(self, name: str):
+    if not name.endswith(PROCESSOR_TAG):
+      name = name + '_' + PROCESSOR_TAG
+    super().__init__(name=name)
+  @abc.abstractmethod
+  def __call__(
+      self,
+      node_fts: _Array,
+      edge_fts: _Array,
+      graph_fts: _Array,
+      adj_mat: _Array,
+      hidden: _Array,
+      **kwargs,
+  ) -> Tuple[_Array, Optional[_Array]]:
+    """Processor inference step.
+    Args:
+      node_fts: Node features.
+      edge_fts: Edge features.
+      graph_fts: Graph features.
+      adj_mat: Graph adjacency matrix.
+      hidden: Hidden features.
+      **kwargs: Extra kwargs.
+    Returns:
+      Output of processor inference step as a 2-tuple of (node, edge)
+      embeddings. The edge embeddings can be None.
+    """
+    pass
+  @property
+  def inf_bias(self):
+    return False
+  @property
+  def inf_bias_edge(self):
+    return False
+class GAT(Processor):
+  """Graph Attention Network (Velickovic et al., ICLR 2018)."""
+  def __init__(
+      self,
+      out_size: int,
+      nb_heads: int,
+      activation: Optional[_Fn] = jax.nn.relu,
+      residual: bool = True,
+      use_ln: bool = False,
+      name: str = 'gat_aggr',
+  ):
+    """Configures the GAT processor.
+    Args:
+      out_size: Width of the output node embeddings.
+      nb_heads: Number of attention heads; must divide `out_size`.
+      activation: Optional function applied to the output; skipped if None.
+      residual: Whether to add a linear projection of the input to the output.
+      use_ln: Whether to apply layer normalisation to the output.
+      name: Module name.
+    Raises:
+      ValueError: If `out_size` is not a multiple of `nb_heads`.
+    """
+    super().__init__(name=name)
+    self.out_size = out_size
+    self.nb_heads = nb_heads
+    if out_size % nb_heads != 0:
+      raise ValueError('The number of attention heads must divide the width!')
+    self.head_size = out_size // nb_heads
+    self.activation = activation
+    self.residual = residual
+    self.use_ln = use_ln
+  def __call__(  # pytype: disable=signature-mismatch  # numpy-scalars
+      self,
+      node_fts: _Array,
+      edge_fts: _Array,
+      graph_fts: _Array,
+      adj_mat: _Array,
+      hidden: _Array,
+      **unused_kwargs,
+  ) -> _Array:
+    """GAT inference step.
+    Args:
+      node_fts: Node features, of shape [B, N, ...].
+      edge_fts: Edge features, of shape [B, N, N, ...].
+      graph_fts: Graph features, of shape [B, ...].
+      adj_mat: Adjacency matrix, of shape [B, N, N].
+      hidden: Hidden features, concatenated to `node_fts` on the last axis.
+      **unused_kwargs: Ignored.
+    Returns:
+      A 2-tuple `(ret, None)`, where `ret` holds node embeddings of shape
+      [B, N, out_size]. Note that this is a tuple even though the annotation
+      says `_Array`.
+    """
+    b, n, _ = node_fts.shape
+    assert edge_fts.shape[:-1] == (b, n, n)
+    assert graph_fts.shape[:-1] == (b,)
+    assert adj_mat.shape == (b, n, n)
+    z = jnp.concatenate([node_fts, hidden], axis=-1)
+    # NOTE(review): these Linear modules are unnamed, so Haiku presumably names
+    # them by creation order -- reordering these lines may change parameter
+    # names. Confirm before restructuring.
+    m = hk.Linear(self.out_size)
+    skip = hk.Linear(self.out_size)
+    # Additive attention bias: 0 where `adj_mat` is 1 and -1e9 where it is 0,
+    # so entries with `adj_mat == 0` get negligible weight after the softmax.
+    bias_mat = (adj_mat - 1.0) * 1e9
+    bias_mat = jnp.tile(bias_mat[..., None],
+                        (1, 1, 1, self.nb_heads))     # [B, N, N, H]
+    bias_mat = jnp.transpose(bias_mat, (0, 3, 1, 2))  # [B, H, N, N]
+    a_1 = hk.Linear(self.nb_heads)
+    a_2 = hk.Linear(self.nb_heads)
+    a_e = hk.Linear(self.nb_heads)
+    a_g = hk.Linear(self.nb_heads)
+    values = m(z)                                      # [B, N, H*F]
+    values = jnp.reshape(
+        values,
+        values.shape[:-1] + (self.nb_heads, self.head_size))  # [B, N, H, F]
+    values = jnp.transpose(values, (0, 2, 1, 3))              # [B, H, N, F]
+    # One scalar attention term per head from each of: the two node roles,
+    # the edge features and the graph features.
+    att_1 = jnp.expand_dims(a_1(z), axis=-1)
+    att_2 = jnp.expand_dims(a_2(z), axis=-1)
+    att_e = a_e(edge_fts)
+    att_g = jnp.expand_dims(a_g(graph_fts), axis=-1)
+    logits = (
+        jnp.transpose(att_1, (0, 2, 1, 3)) +  # + [B, H, N, 1]
+        jnp.transpose(att_2, (0, 2, 3, 1)) +  # + [B, H, 1, N]
+        jnp.transpose(att_e, (0, 3, 1, 2)) +  # + [B, H, N, N]
+        jnp.expand_dims(att_g, axis=-1)       # + [B, H, 1, 1]
+    )                                         # = [B, H, N, N]
+    # Softmax over the last axis; the bias is added after the leaky ReLU.
+    coefs = jax.nn.softmax(jax.nn.leaky_relu(logits) + bias_mat, axis=-1)
+    ret = jnp.matmul(coefs, values)  # [B, H, N, F]
+    ret = jnp.transpose(ret, (0, 2, 1, 3))  # [B, N, H, F]
+    ret = jnp.reshape(ret, ret.shape[:-2] + (self.out_size,))  # [B, N, H*F]
+    if self.residual:
+      ret += skip(z)
+    if self.activation is not None:
+      ret = self.activation(ret)
+    if self.use_ln:
+      ln = hk.LayerNorm(axis=-1, create_scale=True, create_offset=True)
+      ret = ln(ret)
+    # No edge embeddings are produced, hence the None.
+    return ret, None  # pytype: disable=bad-return-type  # numpy-scalars
+class GATFull(GAT):
+  """Graph Attention Network with full adjacency matrix."""
+  def __call__(self, node_fts: _Array, edge_fts: _Array, graph_fts: _Array,
+               adj_mat: _Array, hidden: _Array, **unused_kwargs) -> _Array:
+    adj_mat = jnp.ones_like(adj_mat)
+    return super().__call__(node_fts, edge_fts, graph_fts, adj_mat, hidden)
+class GATv2(Processor):
+  """Graph Attention Network v2 (Brody et al., ICLR 2022)."""
+  def __init__(
+      self,
+      out_size: int,
+      nb_heads: int,
+      mid_size: Optional[int] = None,
+      activation: Optional[_Fn] = jax.nn.relu,
+      residual: bool = True,
+      use_ln: bool = False,
+      name: str = 'gatv2_aggr',
+  ):
+    """Configures the GATv2 processor.
+    Args:
+      out_size: Width of the output node embeddings.
+      nb_heads: Number of attention heads; must divide both `out_size` and
+        `mid_size`.
+      mid_size: Width of the pre-attention features; defaults to `out_size`
+        when None.
+      activation: Optional function applied to the output; skipped if None.
+      residual: Whether to add a linear projection of the input to the output.
+      use_ln: Whether to apply layer normalisation to the output.
+      name: Module name.
+    Raises:
+      ValueError: If `out_size` or `mid_size` is not a multiple of `nb_heads`.
+    """
+    super().__init__(name=name)
+    if mid_size is None:
+      self.mid_size = out_size
+    else:
+      self.mid_size = mid_size
+    self.out_size = out_size
+    self.nb_heads = nb_heads
+    if out_size % nb_heads != 0:
+      raise ValueError('The number of attention heads must divide the width!')
+    self.head_size = out_size // nb_heads
+    if self.mid_size % nb_heads != 0:
+      raise ValueError('The number of attention heads must divide the message!')
+    self.mid_head_size = self.mid_size // nb_heads
+    self.activation = activation
+    self.residual = residual
+    self.use_ln = use_ln
+  def __call__(  # pytype: disable=signature-mismatch  # numpy-scalars
+      self,
+      node_fts: _Array,
+      edge_fts: _Array,
+      graph_fts: _Array,
+      adj_mat: _Array,
+      hidden: _Array,
+      **unused_kwargs,
+  ) -> _Array:
+    """GATv2 inference step.
+    Args:
+      node_fts: Node features, of shape [B, N, ...].
+      edge_fts: Edge features, of shape [B, N, N, ...].
+      graph_fts: Graph features, of shape [B, ...].
+      adj_mat: Adjacency matrix, of shape [B, N, N].
+      hidden: Hidden features, concatenated to `node_fts` on the last axis.
+      **unused_kwargs: Ignored.
+    Returns:
+      A 2-tuple `(ret, None)`, where `ret` holds node embeddings of shape
+      [B, N, out_size]. Note that this is a tuple even though the annotation
+      says `_Array`.
+    """
+    b, n, _ = node_fts.shape
+    assert edge_fts.shape[:-1] == (b, n, n)
+    assert graph_fts.shape[:-1] == (b,)
+    assert adj_mat.shape == (b, n, n)
+    z = jnp.concatenate([node_fts, hidden], axis=-1)
+    # NOTE(review): these Linear modules are unnamed, so Haiku presumably names
+    # them by creation order -- reordering these lines may change parameter
+    # names. Confirm before restructuring.
+    m = hk.Linear(self.out_size)
+    skip = hk.Linear(self.out_size)
+    # Additive attention bias: 0 where `adj_mat` is 1 and -1e9 where it is 0,
+    # so entries with `adj_mat == 0` get negligible weight after the softmax.
+    bias_mat = (adj_mat - 1.0) * 1e9
+    bias_mat = jnp.tile(bias_mat[..., None],
+                        (1, 1, 1, self.nb_heads))     # [B, N, N, H]
+    bias_mat = jnp.transpose(bias_mat, (0, 3, 1, 2))  # [B, H, N, N]
+    w_1 = hk.Linear(self.mid_size)
+    w_2 = hk.Linear(self.mid_size)
+    w_e = hk.Linear(self.mid_size)
+    w_g = hk.Linear(self.mid_size)
+    # One single-output scoring layer per attention head.
+    a_heads = []
+    for _ in range(self.nb_heads):
+      a_heads.append(hk.Linear(1))
+    values = m(z)                                      # [B, N, H*F]
+    values = jnp.reshape(
+        values,
+        values.shape[:-1] + (self.nb_heads, self.head_size))  # [B, N, H, F]
+    values = jnp.transpose(values, (0, 2, 1, 3))              # [B, H, N, F]
+    pre_att_1 = w_1(z)
+    pre_att_2 = w_2(z)
+    pre_att_e = w_e(edge_fts)
+    pre_att_g = w_g(graph_fts)
+    # In the shape comments below, the last axis of the pre-attention tensors
+    # has size `mid_size` (split into `mid_head_size` per head), while for
+    # `values` F is `head_size`.
+    pre_att = (
+        jnp.expand_dims(pre_att_1, axis=1) +     # + [B, 1, N, H*F]
+        jnp.expand_dims(pre_att_2, axis=2) +     # + [B, N, 1, H*F]
+        pre_att_e +                              # + [B, N, N, H*F]
+        jnp.expand_dims(pre_att_g, axis=(1, 2))  # + [B, 1, 1, H*F]
+    )                                            # = [B, N, N, H*F]
+    pre_att = jnp.reshape(
+        pre_att,
+        pre_att.shape[:-1] + (self.nb_heads, self.mid_head_size)
+    )  # [B, N, N, H, F]
+    pre_att = jnp.transpose(pre_att, (0, 3, 1, 2, 4))  # [B, H, N, N, F]
+    # This part is not very efficient, but we agree to keep it this way to
+    # enhance readability, assuming `nb_heads` will not be large.
+    logit_heads = []
+    for head in range(self.nb_heads):
+      logit_heads.append(
+          jnp.squeeze(
+              a_heads[head](jax.nn.leaky_relu(pre_att[:, head])),
+              axis=-1)
+      )  # [B, N, N]
+    logits = jnp.stack(logit_heads, axis=1)  # [B, H, N, N]
+    # Softmax over the last axis, with the adjacency bias added to the logits.
+    coefs = jax.nn.softmax(logits + bias_mat, axis=-1)
+    ret = jnp.matmul(coefs, values)  # [B, H, N, F]
+    ret = jnp.transpose(ret, (0, 2, 1, 3))  # [B, N, H, F]
+    ret = jnp.reshape(ret, ret.shape[:-2] + (self.out_size,))  # [B, N, H*F]
+    if self.residual:
+      ret += skip(z)
+    if self.activation is not None:
+      ret = self.activation(ret)
+    if self.use_ln:
+      ln = hk.LayerNorm(axis=-1, create_scale=True, create_offset=True)
+      ret = ln(ret)
+    # No edge embeddings are produced, hence the None.
+    return ret, None  # pytype: disable=bad-return-type  # numpy-scalars
+class GATv2Full(GATv2):
+  """Graph Attention Network v2 with full adjacency matrix."""
+  def __call__(self, node_fts: _Array, edge_fts: _Array, graph_fts: _Array,
+               adj_mat: _Array, hidden: _Array, **unused_kwargs) -> _Array:
+    adj_mat = jnp.ones_like(adj_mat)
+    return super().__call__(node_fts, edge_fts, graph_fts, adj_mat, hidden)
+def get_triplet_msgs(z, edge_fts, graph_fts, nb_triplet_fts):
+  """Triplet messages, as done by Dudzik and Velickovic (2022)."""
+  t_1 = hk.Linear(nb_triplet_fts)
+  t_2 = hk.Linear(nb_triplet_fts)
+  t_3 = hk.Linear(nb_triplet_fts)
+  t_e_1 = hk.Linear(nb_triplet_fts)
+  t_e_2 = hk.Linear(nb_triplet_fts)
+  t_e_3 = hk.Linear(nb_triplet_fts)
+  t_g = hk.Linear(nb_triplet_fts)
+  tri_1 = t_1(z)
+  tri_2 = t_2(z)
+  tri_3 = t_3(z)
+  tri_e_1 = t_e_1(edge_fts)
+  tri_e_2 = t_e_2(edge_fts)
+  tri_e_3 = t_e_3(edge_fts)
+  tri_g = t_g(graph_fts)
+  return (
+      jnp.expand_dims(tri_1, axis=(2, 3))    +  #   (B, N, 1, 1, H)
+      jnp.expand_dims(tri_2, axis=(1, 3))    +  # + (B, 1, N, 1, H)
+      jnp.expand_dims(tri_3, axis=(1, 2))    +  # + (B, 1, 1, N, H)
+      jnp.expand_dims(tri_e_1, axis=3)       +  # + (B, N, N, 1, H)
+      jnp.expand_dims(tri_e_2, axis=2)       +  # + (B, N, 1, N, H)
+      jnp.expand_dims(tri_e_3, axis=1)       +  # + (B, 1, N, N, H)
+      jnp.expand_dims(tri_g, axis=(1, 2, 3))    # + (B, 1, 1, 1, H)
+  )                                             # = (B, N, N, N, H)
+class PGN(Processor):
+  """Pointer Graph Networks (Veličković et al., NeurIPS 2020).
+  Messages are built for every pair of nodes from node, edge and graph
+  projections, masked by the adjacency matrix, reduced per node and combined
+  with a projection of the node's own input.
+  """
+  def __init__(
+      self,
+      out_size: int,
+      mid_size: Optional[int] = None,
+      mid_act: Optional[_Fn] = None,
+      activation: Optional[_Fn] = jax.nn.relu,
+      reduction: _Fn = jnp.max,
+      msgs_mlp_sizes: Optional[List[int]] = None,
+      use_ln: bool = False,
+      use_triplets: bool = False,
+      nb_triplet_fts: int = 8,
+      gated: bool = False,
+      name: str = 'mpnn_aggr',
+  ):
+    """Stores the processor configuration.
+    Args:
+      out_size: Size of the output (and gate / triplet output) projections.
+      mid_size: Size of the message projections; defaults to `out_size`.
+      mid_act: Optional activation applied to the messages before reduction.
+      activation: Optional activation applied to the output and to the
+        triplet messages.
+      reduction: Aggregation over messages. `jnp.mean` and `jnp.max` are
+        special-cased; any other callable is invoked as
+        `reduction(masked_msgs, axis=1)`.
+      msgs_mlp_sizes: If set, messages go through a ReLU and an MLP with
+        these layer sizes.
+      use_ln: Whether to apply layer normalisation to the output.
+      use_triplets: Whether to also compute triplet messages.
+      nb_triplet_fts: Feature size of the triplet projections.
+      gated: Whether to gate the update against the previous hidden state.
+      name: Haiku module name.
+    """
+    super().__init__(name=name)
+    if mid_size is None:
+      self.mid_size = out_size
+    else:
+      self.mid_size = mid_size
+    self.out_size = out_size
+    self.mid_act = mid_act
+    self.activation = activation
+    self.reduction = reduction
+    self._msgs_mlp_sizes = msgs_mlp_sizes
+    self.use_ln = use_ln
+    self.use_triplets = use_triplets
+    self.nb_triplet_fts = nb_triplet_fts
+    self.gated = gated
+  def __call__(  # pytype: disable=signature-mismatch  # numpy-scalars
+      self,
+      node_fts: _Array,
+      edge_fts: _Array,
+      graph_fts: _Array,
+      adj_mat: _Array,
+      hidden: _Array,
+      **unused_kwargs,
+  ) -> _Array:
+    """MPNN inference step.
+    Args:
+      node_fts: Node features; must be rank 3, `[B, N, *]`.
+      edge_fts: Edge features, `[B, N, N, *]`.
+      graph_fts: Graph features, `[B, *]`.
+      adj_mat: Adjacency matrix, `[B, N, N]`.
+      hidden: Hidden node state, concatenated with `node_fts` on the last axis.
+      **unused_kwargs: Accepted and ignored.
+    Returns:
+      A `(ret, tri_msgs)` tuple; `tri_msgs` is None unless `use_triplets`.
+    """
+    b, n, _ = node_fts.shape
+    assert edge_fts.shape[:-1] == (b, n, n)
+    assert graph_fts.shape[:-1] == (b,)
+    assert adj_mat.shape == (b, n, n)
+    z = jnp.concatenate([node_fts, hidden], axis=-1)
+    m_1 = hk.Linear(self.mid_size)
+    m_2 = hk.Linear(self.mid_size)
+    m_e = hk.Linear(self.mid_size)
+    m_g = hk.Linear(self.mid_size)
+    o1 = hk.Linear(self.out_size)
+    o2 = hk.Linear(self.out_size)
+    msg_1 = m_1(z)
+    msg_2 = m_2(z)
+    msg_e = m_e(edge_fts)
+    msg_g = m_g(graph_fts)
+    tri_msgs = None
+    if self.use_triplets:
+      # Triplet messages, as done by Dudzik and Velickovic (2022)
+      triplets = get_triplet_msgs(z, edge_fts, graph_fts, self.nb_triplet_fts)
+      o3 = hk.Linear(self.out_size)
+      # Max-reduce the triplets over axis 1 to get one message per node pair.
+      tri_msgs = o3(jnp.max(triplets, axis=1))  # (B, N, N, H)
+      if self.activation is not None:
+        tri_msgs = self.activation(tri_msgs)
+    # Pairwise messages: node terms are broadcast along axes 1 and 2, the
+    # graph term along both.
+    msgs = (
+        jnp.expand_dims(msg_1, axis=1) + jnp.expand_dims(msg_2, axis=2) +
+        msg_e + jnp.expand_dims(msg_g, axis=(1, 2)))
+    if self._msgs_mlp_sizes is not None:
+      msgs = hk.nets.MLP(self._msgs_mlp_sizes)(jax.nn.relu(msgs))
+    if self.mid_act is not None:
+      msgs = self.mid_act(msgs)
+    if self.reduction == jnp.mean:
+      # NOTE(review): messages are summed over axis 1 but normalised by the
+      # sum of `adj_mat` over its last axis; the two counts agree only when
+      # `adj_mat` is symmetric, and a zero count divides by zero. Confirm
+      # that callers always pass a symmetric matrix with self-loops.
+      msgs = jnp.sum(msgs * jnp.expand_dims(adj_mat, -1), axis=1)
+      msgs = msgs / jnp.sum(adj_mat, axis=-1, keepdims=True)
+    elif self.reduction == jnp.max:
+      # Entries where `adj_mat` is zero are replaced by -BIG_NUMBER so that
+      # they cannot win the max.
+      maxarg = jnp.where(jnp.expand_dims(adj_mat, -1),
+                         msgs,
+                         -BIG_NUMBER)
+      msgs = jnp.max(maxarg, axis=1)
+    else:
+      msgs = self.reduction(msgs * jnp.expand_dims(adj_mat, -1), axis=1)
+    h_1 = o1(z)
+    h_2 = o2(msgs)
+    ret = h_1 + h_2
+    if self.activation is not None:
+      ret = self.activation(ret)
+    if self.use_ln:
+      ln = hk.LayerNorm(axis=-1, create_scale=True, create_offset=True)
+      ret = ln(ret)
+    if self.gated:
+      gate1 = hk.Linear(self.out_size)
+      gate2 = hk.Linear(self.out_size)
+      # The final gate layer has its bias initialised to -3.
+      gate3 = hk.Linear(self.out_size, b_init=hk.initializers.Constant(-3))
+      gate = jax.nn.sigmoid(gate3(jax.nn.relu(gate1(z) + gate2(msgs))))
+      # Convex combination of the new value and the previous hidden state.
+      ret = ret * gate + hidden * (1-gate)
+    return ret, tri_msgs  # pytype: disable=bad-return-type  # numpy-scalars
+class DeepSets(PGN):
+  """Deep Sets (Zaheer et al., NeurIPS 2017)."""
+  def __call__(self, node_fts: _Array, edge_fts: _Array, graph_fts: _Array,
+               adj_mat: _Array, hidden: _Array, **unused_kwargs) -> _Array:
+    assert adj_mat.ndim == 3
+    adj_mat = jnp.ones_like(adj_mat) * jnp.eye(adj_mat.shape[-1])
+    return super().__call__(node_fts, edge_fts, graph_fts, adj_mat, hidden)
+class MPNN(PGN):
+  """Message-Passing Neural Network (Gilmer et al., ICML 2017)."""
+  def __call__(self, node_fts: _Array, edge_fts: _Array, graph_fts: _Array,
+               adj_mat: _Array, hidden: _Array, **unused_kwargs) -> _Array:
+    adj_mat = jnp.ones_like(adj_mat)
+    return super().__call__(node_fts, edge_fts, graph_fts, adj_mat, hidden)
+class PGNMask(PGN):
+  """Masked Pointer Graph Networks (Veličković et al., NeurIPS 2020).
+  Behaves like `PGN` but reports both `inf_bias` flags as True.
+  """
+  # NOTE(review): the consumers of these flags are not visible in this part
+  # of the file; presumably they are read by the network code. Confirm.
+  @property
+  def inf_bias(self):
+    return True
+  @property
+  def inf_bias_edge(self):
+    return True
+class MemNetMasked(Processor):
+  """Implementation of End-to-End Memory Networks.
+  Inspired by the description in https://arxiv.org/abs/1503.08895.
+  """
+  def __init__(
+      self,
+      vocab_size: int,
+      sentence_size: int,
+      linear_output_size: int,
+      embedding_size: int = 16,
+      memory_size: Optional[int] = 128,
+      num_hops: int = 1,
+      nonlin: Callable[[Any], Any] = jax.nn.relu,
+      apply_embeddings: bool = True,
+      init_func: hk.initializers.Initializer = jnp.zeros,
+      use_ln: bool = False,
+      name: str = 'memnet') -> None:
+    """Constructor.
+    Args:
+      vocab_size: the number of words in the dictionary (each story, query and
+        answer contains symbols coming from this dictionary).
+      sentence_size: the dimensionality of each memory.
+      linear_output_size: the dimensionality of the output of the last layer
+        of the model.
+      embedding_size: the dimensionality of the latent space to where all
+        memories are projected.
+      memory_size: the number of memories provided.
+      num_hops: the number of layers in the model.
+      nonlin: non-linear transformation applied at the end of each layer.
+      apply_embeddings: flag whether to apply embeddings.
+      init_func: initialization function for the biases.
+      use_ln: whether to use layer normalisation in the model.
+      name: the name of the model.
+    """
+    super().__init__(name=name)
+    self._vocab_size = vocab_size
+    self._embedding_size = embedding_size
+    self._sentence_size = sentence_size
+    self._memory_size = memory_size
+    self._linear_output_size = linear_output_size
+    self._num_hops = num_hops
+    self._nonlin = nonlin
+    self._apply_embeddings = apply_embeddings
+    self._init_func = init_func
+    self._use_ln = use_ln
+    # Encoding part: i.e. "I" of the paper.
+    self._encodings = _position_encoding(sentence_size, embedding_size)
+  def __call__(  # pytype: disable=signature-mismatch  # numpy-scalars
+      self,
+      node_fts: _Array,
+      edge_fts: _Array,
+      graph_fts: _Array,
+      adj_mat: _Array,
+      hidden: _Array,
+      **unused_kwargs,
+  ) -> _Array:
+    """MemNet inference step.
+    The graph features are appended as one extra entry along the node axis,
+    the memory network is mapped over that axis, and the result for the
+    extra entry is added back onto every node.
+    Args:
+      node_fts: Node features.
+      edge_fts: Edge features; multiplied by `adj_mat` before use.
+      graph_fts: Graph features.
+      adj_mat: Adjacency matrix used to mask `edge_fts`.
+      hidden: Unused; deleted immediately.
+      **unused_kwargs: Accepted and ignored.
+    Returns:
+      A `(nxt_hidden, None)` tuple.
+    """
+    del hidden
+    node_and_graph_fts = jnp.concatenate([node_fts, graph_fts[:, None]],
+                                         axis=1)
+    # Zero out masked edges, then pad axes 1 and 2 by one to match the extra
+    # graph entry added above.
+    edge_fts_padded = jnp.pad(edge_fts * adj_mat[..., None],
+                              ((0, 0), (0, 1), (0, 1), (0, 0)))
+    # `(1)` is the plain integer 1: both arguments are mapped over axis 1 and
+    # the outputs are stacked along axis 1.
+    nxt_hidden = jax.vmap(self._apply, (1), 1)(node_and_graph_fts,
+                                               edge_fts_padded)
+    # Broadcast hidden state corresponding to graph features across the nodes.
+    nxt_hidden = nxt_hidden[:, :-1] + nxt_hidden[:, -1:]
+    return nxt_hidden, None  # pytype: disable=bad-return-type  # numpy-scalars
+  def _apply(self, queries: _Array, stories: _Array) -> _Array:
+    """Apply Memory Network to the queries and stories.
+    Args:
+      queries: Tensor of shape [batch_size, sentence_size].
+      stories: Tensor of shape [batch_size, memory_size, sentence_size].
+    Returns:
+      Tensor of shape [batch_size, vocab_size].
+    """
+    if self._apply_embeddings:
+      query_biases = hk.get_parameter(
+          'query_biases',
+          shape=[self._vocab_size - 1, self._embedding_size],
+          init=self._init_func)
+      stories_biases = hk.get_parameter(
+          'stories_biases',
+          shape=[self._vocab_size - 1, self._embedding_size],
+          init=self._init_func)
+      memory_biases = hk.get_parameter(
+          'memory_contents',
+          shape=[self._memory_size, self._embedding_size],
+          init=self._init_func)
+      output_biases = hk.get_parameter(
+          'output_biases',
+          shape=[self._vocab_size - 1, self._embedding_size],
+          init=self._init_func)
+      # Fixed all-zero row appended to each embedding table below, bringing
+      # the tables to `vocab_size` rows.
+      nil_word_slot = jnp.zeros([1, self._embedding_size])
+    # This is "A" in the paper.
+    if self._apply_embeddings:
+      stories_biases = jnp.concatenate([stories_biases, nil_word_slot], axis=0)
+      memory_embeddings = jnp.take(
+          stories_biases, stories.reshape([-1]).astype(jnp.int32),
+          axis=0).reshape(list(stories.shape) + [self._embedding_size])
+      # Pad the memory axis (axis 1) up to `memory_size`.
+      memory_embeddings = jnp.pad(
+          memory_embeddings,
+          ((0, 0), (0, self._memory_size - jnp.shape(memory_embeddings)[1]),
+           (0, 0), (0, 0)))
+      memory = jnp.sum(memory_embeddings * self._encodings, 2) + memory_biases
+    else:
+      memory = stories
+    # This is "B" in the paper. Also, when there are no queries (only
+    # sentences), then these lines are substituted by
+    # query_embeddings = 0.1.
+    if self._apply_embeddings:
+      query_biases = jnp.concatenate([query_biases, nil_word_slot], axis=0)
+      query_embeddings = jnp.take(
+          query_biases, queries.reshape([-1]).astype(jnp.int32),
+          axis=0).reshape(list(queries.shape) + [self._embedding_size])
+      # This is "u" in the paper.
+      query_input_embedding = jnp.sum(query_embeddings * self._encodings, 1)
+    else:
+      query_input_embedding = queries
+    # This is "C" in the paper.
+    if self._apply_embeddings:
+      output_biases = jnp.concatenate([output_biases, nil_word_slot], axis=0)
+      output_embeddings = jnp.take(
+          output_biases, stories.reshape([-1]).astype(jnp.int32),
+          axis=0).reshape(list(stories.shape) + [self._embedding_size])
+      output_embeddings = jnp.pad(
+          output_embeddings,
+          ((0, 0), (0, self._memory_size - jnp.shape(output_embeddings)[1]),
+           (0, 0), (0, 0)))
+      output = jnp.sum(output_embeddings * self._encodings, 2)
+    else:
+      output = stories
+    # Both linear layers are created once and shared by all hops.
+    intermediate_linear = hk.Linear(self._embedding_size, with_bias=False)
+    # Output_linear is "H".
+    output_linear = hk.Linear(self._linear_output_size, with_bias=False)
+    for hop_number in range(self._num_hops):
+      query_input_embedding_transposed = jnp.transpose(
+          jnp.expand_dims(query_input_embedding, -1), [0, 2, 1])
+      # Calculate probabilities.
+      probs = jax.nn.softmax(
+          jnp.sum(memory * query_input_embedding_transposed, 2))
+      # Calculate output of the layer by multiplying by C.
+      transposed_probs = jnp.transpose(jnp.expand_dims(probs, -1), [0, 2, 1])
+      transposed_output_embeddings = jnp.transpose(output, [0, 2, 1])
+      # This is "o" in the paper.
+      layer_output = jnp.sum(transposed_output_embeddings * transposed_probs, 2)
+      # Finally the answer
+      if hop_number == self._num_hops - 1:
+        # Please note that in the TF version we apply the final linear layer
+        # in all hops and this results in shape mismatches.
+        output_layer = output_linear(query_input_embedding + layer_output)
+      else:
+        output_layer = intermediate_linear(query_input_embedding + layer_output)
+      # The next hop's query is bound before the nonlinearity is applied, so
+      # it carries the pre-activation value.
+      query_input_embedding = output_layer
+      if self._nonlin:
+        output_layer = self._nonlin(output_layer)
+    # This linear here is "W".
+    ret = hk.Linear(self._vocab_size, with_bias=False)(output_layer)
+    if self._use_ln:
+      ln = hk.LayerNorm(axis=-1, create_scale=True, create_offset=True)
+      ret = ln(ret)
+    return ret
+class MemNetFull(MemNetMasked):
+  """Memory Networks with full adjacency matrix."""
+  def __call__(self, node_fts: _Array, edge_fts: _Array, graph_fts: _Array,
+               adj_mat: _Array, hidden: _Array, **unused_kwargs) -> _Array:
+    adj_mat = jnp.ones_like(adj_mat)
+    return super().__call__(node_fts, edge_fts, graph_fts, adj_mat, hidden)
+# A processor factory maps an integer (the `out_size` passed by
+# `get_processor_factory`'s inner factory) to a `Processor` instance.
+ProcessorFactory = Callable[[int], Processor]
+def get_processor_factory(kind: str,
+                          use_ln: bool,
+                          nb_triplet_fts: int,
+                          nb_heads: Optional[int] = None) -> ProcessorFactory:
+  """Returns a processor factory.
+  Args:
+    kind: One of the available types of processor.
+    use_ln: Whether the processor passes the output through a layernorm layer.
+    nb_triplet_fts: How many triplet features to compute.
+    nb_heads: Number of attention heads for GAT processors.
+  Returns:
+    A callable that takes an `out_size` parameter (equal to the hidden
+    dimension of the network) and returns a processor instance.
+  """
+  def _factory(out_size: int):
+    if kind == 'deepsets':
+      processor = DeepSets(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=0
+      )
+    elif kind == 'gat':
+      processor = GAT(
+          out_size=out_size,
+          nb_heads=nb_heads,
+          use_ln=use_ln,
+      )
+    elif kind == 'gat_full':
+      processor = GATFull(
+          out_size=out_size,
+          nb_heads=nb_heads,
+          use_ln=use_ln
+      )
+    elif kind == 'gatv2':
+      processor = GATv2(
+          out_size=out_size,
+          nb_heads=nb_heads,
+          use_ln=use_ln
+      )
+    elif kind == 'gatv2_full':
+      processor = GATv2Full(
+          out_size=out_size,
+          nb_heads=nb_heads,
+          use_ln=use_ln
+      )
+    elif kind == 'memnet_full':
+      processor = MemNetFull(
+          vocab_size=out_size,
+          sentence_size=out_size,
+          linear_output_size=out_size,
+      )
+    elif kind == 'memnet_masked':
+      processor = MemNetMasked(
+          vocab_size=out_size,
+          sentence_size=out_size,
+          linear_output_size=out_size,
+      )
+    elif kind == 'mpnn':
+      processor = MPNN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=0,
+      )
+    elif kind == 'pgn':
+      processor = PGN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=0,
+      )
+    elif kind == 'pgn_mask':
+      processor = PGNMask(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=0,
+      )
+    elif kind == 'triplet_mpnn':
+      processor = MPNN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+      )
+    elif kind == 'triplet_pgn':
+      processor = PGN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+      )
+    elif kind == 'triplet_pgn_mask':
+      processor = PGNMask(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+      )
+    elif kind == 'gpgn':
+      processor = PGN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    elif kind == 'gpgn_mask':
+      processor = PGNMask(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    elif kind == 'gmpnn':
+      processor = MPNN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=False,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    elif kind == 'triplet_gpgn':
+      processor = PGN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    elif kind == 'triplet_gpgn_mask':
+      processor = PGNMask(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    elif kind == 'triplet_gmpnn':
+      processor = MPNN(
+          out_size=out_size,
+          msgs_mlp_sizes=[out_size, out_size],
+          use_ln=use_ln,
+          use_triplets=True,
+          nb_triplet_fts=nb_triplet_fts,
+          gated=True,
+      )
+    else:
+      raise ValueError('Unexpected processor kind ' + kind)
+    return processor
+  return _factory
+def _position_encoding(sentence_size: int, embedding_size: int) -> np.ndarray:
+  """Position Encoding described in section 4.1 [1]."""
+  encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
+  ls = sentence_size + 1
+  le = embedding_size + 1
+  for i in range(1, le):
+    for j in range(1, ls):
+      encoding[i - 1, j - 1] = (i - (le - 1) / 2) * (j - (ls - 1) / 2)
+  encoding = 1 + 4 * encoding / embedding_size / sentence_size
+  return np.transpose(encoding)

benchmarks/CLRS/env/processors_test.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for processors.py."""
+from absl.testing import absltest
+import chex
+from clrs._src import processors
+import haiku as hk
+import jax.numpy as jnp
+class MemnetTest(absltest.TestCase):
+  def test_simple_run_and_check_shapes(self):
+    batch_size = 64
+    vocab_size = 177
+    embedding_size = 64
+    sentence_size = 11
+    memory_size = 320
+    linear_output_size = 128
+    num_hops = 2
+    use_ln = True
+    def forward_fn(queries, stories):
+      model = processors.MemNetFull(
+          vocab_size=vocab_size,
+          embedding_size=embedding_size,
+          sentence_size=sentence_size,
+          memory_size=memory_size,
+          linear_output_size=linear_output_size,
+          num_hops=num_hops,
+          use_ln=use_ln)
+      return model._apply(queries, stories)
+    forward = hk.transform(forward_fn)
+    queries = jnp.ones([batch_size, sentence_size], dtype=jnp.int32)
+    stories = jnp.ones([batch_size, memory_size, sentence_size],
+                       dtype=jnp.int32)
+    key = hk.PRNGSequence(42)
+    params = forward.init(next(key), queries, stories)
+    model_output = forward.apply(params, None, queries, stories)
+    chex.assert_shape(model_output, [batch_size, vocab_size])
+    chex.assert_type(model_output, jnp.float32)
+# Run the absl test runner when this file is executed as a script.
+if __name__ == '__main__':
+  absltest.main()

benchmarks/CLRS/env/samplers.py ADDED Viewed

	@@ -0,0 +1,882 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sampling utilities."""
+import abc
+import collections
+import inspect
+import types
+from typing import Any, Callable, List, Optional, Tuple
+from absl import logging
+from clrs._src import algorithms
+from clrs._src import probing
+from clrs._src import specs
+import jax
+import numpy as np
+# Type aliases used in the annotations of this module.
+_Array = np.ndarray
+_DataPoint = probing.DataPoint
+# A trajectory is a list of data points; trajectories is a list of those.
+Trajectory = List[_DataPoint]
+Trajectories = List[Trajectory]
+# An algorithm is any callable; its signature is not constrained here.
+Algorithm = Callable[..., Any]
+Features = collections.namedtuple('Features', ['inputs', 'hints', 'lengths'])
+FeaturesChunked = collections.namedtuple(
+    'Features', ['inputs', 'hints', 'is_first', 'is_last'])
+Feedback = collections.namedtuple('Feedback', ['features', 'outputs'])
+# CLRS-30 baseline spec.
+CLRS30 = types.MappingProxyType({
+    'train': {
+        'num_samples': 1000,
+        'length': 16,
+        'seed': 1,
+    },
+    'val': {
+        'num_samples': 32,
+        'length': 16,
+        'seed': 2,
+    },
+    'test': {
+        'num_samples': 32,
+        'length': 64,
+        'seed': 3,
+    },
+})
+class Sampler(abc.ABC):
+  """Sampler abstract base class."""
+  def __init__(
+      self,
+      algorithm: Algorithm,
+      spec: specs.Spec,
+      num_samples: int,
+      *args,
+      seed: Optional[int] = None,
+      **kwargs,
+  ):
+    """Initializes a `Sampler`.
+    Args:
+      algorithm: The algorithm to sample from
+      spec: The algorithm spec.
+      num_samples: Number of algorithm unrolls to sample. If positive, all the
+        samples will be generated in the constructor, and at each call
+        of the `next` method a batch will be randomly selected among them.
+        If -1, samples are generated on the fly with each call to `next`.
+      *args: Algorithm args.
+      seed: RNG seed.
+      **kwargs: Algorithm kwargs.
+    """
+    # Use `RandomState` to ensure deterministic sampling across Numpy versions.
+    self._rng = np.random.RandomState(seed)
+    self._spec = spec
+    self._num_samples = num_samples
+    self._algorithm = algorithm
+    self._args = args
+    self._kwargs = kwargs
+    if num_samples < 0:
+      logging.warning('Sampling dataset on-the-fly, unlimited samples.')
+      # Just get an initial estimate of max hint length
+      self.max_steps = -1
+      for _ in range(1000):
+        data = self._sample_data(*args, **kwargs)
+        _, probes = algorithm(*data)
+        _, _, hint = probing.split_stages(probes, spec)
+        for dp in hint:
+          assert dp.data.shape[1] == 1  # batching axis
+          if dp.data.shape[0] > self.max_steps:
+            self.max_steps = dp.data.shape[0]
+    else:
+      logging.info('Creating a dataset with %i samples.', num_samples)
+      (self._inputs, self._outputs, self._hints,
+       self._lengths) = self._make_batch(num_samples, spec, 0, algorithm, *args,
+                                         **kwargs)
+  def _make_batch(self, num_samples: int, spec: specs.Spec, min_length: int,
+                  algorithm: Algorithm, *args, **kwargs):
+    """Generate a batch of data."""
+    inputs = []
+    outputs = []
+    hints = []
+    for _ in range(num_samples):
+      data = self._sample_data(*args, **kwargs)
+      _, probes = algorithm(*data)
+      inp, outp, hint = probing.split_stages(probes, spec)
+      inputs.append(inp)
+      outputs.append(outp)
+      hints.append(hint)
+      if len(hints) % 1000 == 0:
+        logging.info('%i samples created', len(hints))
+    # Batch and pad trajectories to max(T).
+    inputs = _batch_io(inputs)
+    outputs = _batch_io(outputs)
+    hints, lengths = _batch_hints(hints, min_length)
+    return inputs, outputs, hints, lengths
+  def next(self, batch_size: Optional[int] = None) -> Feedback:
+    """Subsamples trajectories from the pre-generated dataset.
+    Args:
+      batch_size: Optional batch size. If `None`, returns entire dataset.
+    Returns:
+      Subsampled trajectories.
+    """
+    if batch_size:
+      if self._num_samples < 0:  # generate on the fly
+        inputs, outputs, hints, lengths = self._make_batch(
+            batch_size, self._spec, self.max_steps,
+            self._algorithm, *self._args, **self._kwargs)
+        if hints[0].data.shape[0] > self.max_steps:
+          logging.warning('Increasing hint lengh from %i to %i',
+                          self.max_steps, hints[0].data.shape[0])
+          self.max_steps = hints[0].data.shape[0]
+      else:
+        if batch_size > self._num_samples:
+          raise ValueError(
+              f'Batch size {batch_size} > dataset size {self._num_samples}.')
+        # Returns a fixed-size random batch.
+        indices = self._rng.choice(self._num_samples, (batch_size,),
+                                   replace=True)
+        inputs = _subsample_data(self._inputs, indices, axis=0)
+        outputs = _subsample_data(self._outputs, indices, axis=0)
+        hints = _subsample_data(self._hints, indices, axis=1)
+        lengths = self._lengths[indices]
+    else:
+      # Returns the full dataset.
+      assert self._num_samples >= 0
+      inputs = self._inputs
+      hints = self._hints
+      lengths = self._lengths
+      outputs = self._outputs
+    return Feedback(Features(inputs, hints, lengths), outputs)
+  @abc.abstractmethod
+  def _sample_data(self, length: int, *args, **kwargs) -> List[_Array]:
+    pass
+  def _random_sequence(self, length, low=0.0, high=1.0):
+    """Random sequence."""
+    return self._rng.uniform(low=low, high=high, size=(length,))
+  def _random_string(self, length, chars=4):
+    """Random string."""
+    return self._rng.randint(0, high=chars, size=(length,))
+  def _random_er_graph(self, nb_nodes, p=0.5, directed=False, acyclic=False,
+                       weighted=False, low=0.0, high=1.0):
+    """Random Erdos-Renyi graph."""
+    mat = self._rng.binomial(1, p, size=(nb_nodes, nb_nodes))
+    if not directed:
+      mat *= np.transpose(mat)
+    elif acyclic:
+      mat = np.triu(mat, k=1)
+      p = self._rng.permutation(nb_nodes)  # To allow nontrivial solutions
+      mat = mat[p, :][:, p]
+    if weighted:
+      weights = self._rng.uniform(low=low, high=high, size=(nb_nodes, nb_nodes))
+      if not directed:
+        weights *= np.transpose(weights)
+        weights = np.sqrt(weights + 1e-3)  # Add epsilon to protect underflow
+      mat = mat.astype(float) * weights
+    return mat
+  def _random_community_graph(self, nb_nodes, k=4, p=0.5, eps=0.01,
+                              directed=False, acyclic=False, weighted=False,
+                              low=0.0, high=1.0):
+    """Random perturbed k-community graph."""
+    mat = np.zeros((nb_nodes, nb_nodes))
+    if k > nb_nodes:
+      raise ValueError(f'Cannot generate graph of too many ({k}) communities.')
+    los, his = [], []
+    lo = 0
+    for i in range(k):
+      if i == k - 1:
+        hi = nb_nodes
+      else:
+        hi = lo + nb_nodes // k
+      mat[lo:hi, lo:hi] = self._random_er_graph(
+          hi - lo, p=p, directed=directed,
+          acyclic=acyclic, weighted=weighted,
+          low=low, high=high)
+      los.append(lo)
+      his.append(hi)
+      lo = hi
+    toggle = self._random_er_graph(nb_nodes, p=eps, directed=directed,
+                                   acyclic=acyclic, weighted=weighted,
+                                   low=low, high=high)
+    # Prohibit closing new cycles
+    for i in range(k):
+      for j in range(i):
+        toggle[los[i]:his[i], los[j]:his[j]] *= 0
+    mat = np.where(toggle > 0.0, (1.0 - (mat > 0.0)) * toggle, mat)
+    p = self._rng.permutation(nb_nodes)  # To allow nontrivial solutions
+    mat = mat[p, :][:, p]
+    return mat
+  def _random_bipartite_graph(self, n, m, p=0.25):
+    """Random bipartite graph-based flow network."""
+    nb_nodes = n + m + 2
+    s = 0
+    t = n + m + 1
+    mat = np.zeros((nb_nodes, nb_nodes))
+    mat[s, 1:n+1] = 1.0  # supersource
+    mat[n+1:n+m+1, t] = 1.0  # supersink
+    mat[1:n+1, n+1:n+m+1] = self._rng.binomial(1, p, size=(n, m))
+    return mat
def build_sampler(
    name: str,
    num_samples: int,
    *args,
    seed: Optional[int] = None,
    **kwargs,
) -> Tuple[Sampler, specs.Spec]:
  """Instantiates the sampler registered for algorithm `name`.

  Args:
    name: Algorithm name; must be a key of both `specs.SPECS` and `SAMPLERS`.
    num_samples: Forwarded to the sampler constructor.
    *args: Extra positional arguments forwarded to the sampler constructor.
    seed: Optional seed forwarded to the sampler constructor.
    **kwargs: Keyword arguments for the sampler. Any that are not parameters
      of the sampler class's `_sample_data` are dropped with a warning.

  Returns:
    A `(sampler, spec)` pair.

  Raises:
    NotImplementedError: If `name` has no spec or no registered sampler.
  """
  if not (name in specs.SPECS and name in SAMPLERS):
    raise NotImplementedError(f'No implementation of algorithm {name}.')
  spec = specs.SPECS[name]
  algorithm = getattr(algorithms, name)
  sampler_class = SAMPLERS[name]

  # Keep only the keyword arguments that `_sample_data` can accept.
  accepted = inspect.signature(sampler_class._sample_data).parameters  # pylint:disable=protected-access
  clean_kwargs = {key: val for key, val in kwargs.items() if key in accepted}
  ignored = set(kwargs) - set(clean_kwargs)
  if ignored:
    logging.warning('Ignoring kwargs %s when building sampler class %s',
                    ignored, sampler_class)

  sampler = sampler_class(
      algorithm, spec, num_samples, *args, seed=seed, **clean_kwargs)
  return sampler, spec
class SortingSampler(Sampler):
  """Sampler for sorting-style tasks: one random sequence, U[0, 1] by default."""

  def _sample_data(self, length: int, low: float = 0., high: float = 1.):
    """Returns `[keys]`, where `keys` comes from `_random_sequence`."""
    keys = self._random_sequence(length=length, low=low, high=high)
    return [keys]
class SearchSampler(Sampler):
  """Sampler for search tasks: a sorted random sequence plus a random target."""

  def _sample_data(self, length: int, low: float = 0., high: float = 1.):
    """Returns `[target, keys]` with `keys` sorted in place, ascending."""
    keys = self._random_sequence(length=length, low=low, high=high)
    keys.sort()
    # The target is drawn after the keys so the RNG stream order is fixed.
    target = self._rng.uniform(low=low, high=high)
    return [target, keys]
class MaxSubarraySampler(Sampler):
  """Sampler for maximum-subarray tasks: a random sequence, U[-1, 1] by default."""

  def _sample_data(self, length: int, low: float = -1., high: float = 1.):
    """Returns `[values]`, where `values` comes from `_random_sequence`."""
    values = self._random_sequence(length=length, low=low, high=high)
    return [values]
class LCSSampler(Sampler):
  """Sampler for Longest Common Subsequence: two random strings."""

  def _sample_data(
      self,
      length: int,
      length_2: Optional[int] = None,
      chars: int = 4,
  ):
    """Returns `[first, second]` random strings over `chars` symbols.

    When `length_2` is omitted, `length` is treated as the combined length:
    the second string gets `length // 2` symbols and the first gets the rest.
    """
    if length_2 is None:
      length_2 = length // 2
      length = length - length_2
    first = self._random_string(length=length, chars=chars)
    second = self._random_string(length=length_2, chars=chars)
    return [first, second]
class OptimalBSTSampler(Sampler):
  """Sampler for optimal BST: one normalised probability array, split in two."""

  def _sample_data(self, length: int):
    """Returns `[p, q]` with `len(p) == length` and `len(q) == length + 1`.

    A single sequence of `2 * length + 1` values is drawn and scaled so that
    it sums to one before being split.
    """
    probs = self._random_sequence(
        length=length + (length + 1), low=0.0, high=1.0)
    probs /= np.sum(probs)
    return [probs[:length], probs[length:]]
class ActivitySampler(Sampler):
  """Sampler for activity selection: random start and finish times."""

  def _sample_data(self, length: int, low: float = 0., high: float = 1.):
    """Returns `[starts, finishes]` with `starts <= finishes` elementwise."""
    first = self._random_sequence(length=length, low=low, high=high)
    second = self._random_sequence(length=length, low=low, high=high)
    # Order each pair so the smaller draw is the start time.
    starts = np.minimum(first, second)
    finishes = np.maximum(first, second)
    return [starts, finishes]
class TaskSampler(Sampler):
  """Sampler for task scheduling: integer deadlines and random values."""

  def _sample_data(
      self,
      length: int,
      max_deadline: Optional[int] = None,
      low: float = 0.,
      high: float = 1.,
  ):
    """Returns `[deadlines, weights]`.

    `max_deadline` defaults to `length`. Deadlines are the output of
    `_random_string` over `max_deadline` symbols, shifted up by one.
    """
    deadline_cap = length if max_deadline is None else max_deadline
    deadlines = self._random_string(length=length, chars=deadline_cap) + 1
    weights = self._random_sequence(length=length, low=low, high=high)
    return [deadlines, weights]
class DfsSampler(Sampler):
  """Sampler for DFS: a random directed, unweighted graph."""

  def _sample_data(self, length: int, p: Tuple[float, ...] = (0.5,)):
    """Returns `[graph]`; the edge probability is picked at random from `p`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=True, acyclic=False,
        weighted=False)
    return [graph]
class BfsSampler(Sampler):
  """Sampler for BFS: a random undirected, unweighted graph and a source."""

  def _sample_data(self, length: int, p: Tuple[float, ...] = (0.5,)):
    """Returns `[graph, source_node]`, with `source_node` in `[0, length)`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=False, acyclic=False,
        weighted=False)
    # The source is drawn after the graph to keep the RNG stream order.
    source_node = self._rng.choice(length)
    return [graph, source_node]
class TopoSampler(Sampler):
  """Sampler for topological sorting: a random directed acyclic graph."""

  def _sample_data(self, length: int, p: Tuple[float, ...] = (0.5,)):
    """Returns `[graph]`; the edge probability is picked at random from `p`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=True, acyclic=True,
        weighted=False)
    return [graph]
class ArticulationSampler(Sampler):
  """Sampler for articulation points: a random undirected, unweighted graph."""

  def _sample_data(self, length: int, p: Tuple[float, ...] = (0.2,)):
    """Returns `[graph]`; the edge probability is picked at random from `p`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=False, acyclic=False,
        weighted=False)
    return [graph]
class MSTSampler(Sampler):
  """Sampler for Kruskal's MST: a random undirected, weighted graph."""

  def _sample_data(
      self,
      length: int,
      p: Tuple[float, ...] = (0.2,),  # lower p to account for class imbalance
      low: float = 0.,
      high: float = 1.,
  ):
    """Returns `[graph]`; `low` and `high` are forwarded to the graph builder."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=False, acyclic=False,
        weighted=True, low=low, high=high)
    return [graph]
class BellmanFordSampler(Sampler):
  """Sampler for Bellman-Ford: a random undirected, weighted graph and a source."""

  def _sample_data(
      self,
      length: int,
      p: Tuple[float, ...] = (0.5,),
      low: float = 0.,
      high: float = 1.,
  ):
    """Returns `[graph, source_node]`, with `source_node` in `[0, length)`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=False, acyclic=False,
        weighted=True, low=low, high=high)
    # The source is drawn after the graph to keep the RNG stream order.
    source_node = self._rng.choice(length)
    return [graph, source_node]
class DAGPathSampler(Sampler):
  """Sampler for DAG shortest paths: a random weighted DAG and a source."""

  def _sample_data(
      self,
      length: int,
      p: Tuple[float, ...] = (0.5,),
      low: float = 0.,
      high: float = 1.,
  ):
    """Returns `[graph, source_node]`, with `source_node` in `[0, length)`."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=True, acyclic=True,
        weighted=True, low=low, high=high)
    # The source is drawn after the graph to keep the RNG stream order.
    source_node = self._rng.choice(length)
    return [graph, source_node]
class FloydWarshallSampler(Sampler):
  """Sampler for all-pairs shortest paths: a random undirected, weighted graph."""

  def _sample_data(
      self,
      length: int,
      p: Tuple[float, ...] = (0.5,),
      low: float = 0.,
      high: float = 1.,
  ):
    """Returns `[graph]`; `low` and `high` are forwarded to the graph builder."""
    edge_prob = self._rng.choice(p)
    graph = self._random_er_graph(
        nb_nodes=length, p=edge_prob, directed=False, acyclic=False,
        weighted=True, low=low, high=high)
    return [graph]
class SccSampler(Sampler):
  """Sampler for strongly connected components (SCC): a community graph."""

  def _sample_data(
      self,
      length: int,
      k: int = 4,
      p: Tuple[float, ...] = (0.5,),
      eps: float = 0.01,
  ):
    """Returns `[graph]`, a directed, unweighted graph from `_random_community_graph`.

    `k`, `eps` and an edge probability picked at random from `p` are forwarded
    to the graph builder.
    """
    edge_prob = self._rng.choice(p)
    graph = self._random_community_graph(
        nb_nodes=length, k=k, p=edge_prob, eps=eps, directed=True,
        acyclic=False, weighted=False)
    return [graph]
class BipartiteSampler(Sampler):
  """Sampler for bipartite-matching flow networks."""

  def _sample_data(
      self,
      length: int,
      length_2: Optional[int] = None,
      p: Tuple[float, ...] = (0.3,),
  ):
    """Returns `[graph, n, m, source, sink]`.

    When `length_2` is omitted, `length` is treated as the combined size: the
    right side gets `length // 2` nodes and the left side gets the rest. The
    source index is 0 and the sink index is `n + m + 1`.
    """
    if length_2 is None:
      length_2 = length // 2
      length = length - length_2
    edge_prob = self._rng.choice(p)
    graph = self._random_bipartite_graph(n=length, m=length_2, p=edge_prob)
    source, sink = 0, length + length_2 + 1
    return [graph, length, length_2, source, sink]
class MatcherSampler(Sampler):
  """String matching sampler; embeds needle in a random haystack."""

  def _sample_data(
      self,
      length: int,  # length of haystack + needle, i.e., total number of nodes
      length_needle: Optional[int] = None,
      chars: int = 4,
  ):
    """Samples a haystack that contains a copy of a random needle.

    Args:
      length: Combined length of haystack and needle.
      length_needle: Needle length. `None` selects `length // 5` (or 1 when
        `length < 5`); a negative value `-k` draws the length with
        `randint(1, high=1 + k)`.
      chars: Alphabet size passed to `_random_string`.

    Returns:
      `[haystack, needle]`.

    Raises:
      ValueError: If the haystack is not strictly longer than the needle, so
        that there is no position to draw for embedding the needle.
    """
    if length_needle is None:
      length_needle = 1 if length < 5 else length // 5
    elif length_needle < 0:  # randomize needle length
      length_needle = self._rng.randint(1, high=1 - length_needle)
    length_haystack = length - length_needle
    needle = self._random_string(length=length_needle, chars=chars)
    haystack = self._random_string(length=length_haystack, chars=chars)

    # `choice(n)` needs n > 0. Fail with an explicit message rather than the
    # opaque numpy error the unchecked call would produce.
    num_positions = length_haystack - length_needle
    if num_positions <= 0:
      raise ValueError(
          f'Cannot embed a needle of length {length_needle} in a haystack of '
          f'length {length_haystack} (total length {length}); the haystack '
          'must be strictly longer than the needle.')
    embed_pos = self._rng.choice(num_positions)
    haystack[embed_pos:embed_pos + length_needle] = needle
    return [haystack, needle]
class SegmentsSampler(Sampler):
  """Sampler of two segments, i.e. four random endpoints (U[0, 1] by default)."""

  def _sample_data(self, length: int, low: float = 0., high: float = 1.):
    """Returns `[xs, ys]`, the coordinates of the four segment endpoints.

    A fair coin decides up front whether the two segments (points 0-1 and
    points 2-3) should intersect; endpoints are redrawn until the quick
    orientation-based test agrees with the coin.
    """
    del length  # There are exactly four endpoints.

    def ccw(x_a, y_a, x_b, y_b, x_c, y_c):
      # Quick counter-clockwise check (ignoring collinearity).
      return (y_c - y_a) * (x_b - x_a) > (y_b - y_a) * (x_c - x_a)

    def intersect(xs, ys):
      pt_a, pt_b, pt_c, pt_d = zip(xs, ys)
      return (ccw(*pt_a, *pt_c, *pt_d) != ccw(*pt_b, *pt_c, *pt_d) and
              ccw(*pt_a, *pt_b, *pt_c) != ccw(*pt_a, *pt_b, *pt_d))

    # Decide (with uniform probability) whether this sample should intersect.
    should_intersect = self._rng.binomial(1, 0.5)
    while True:
      xs = self._random_sequence(length=4, low=low, high=high)
      ys = self._random_sequence(length=4, low=low, high=high)
      if intersect(xs, ys) == should_intersect:
        return [xs, ys]
class ConvexHullSampler(Sampler):
  """Sampler for convex hull tasks: points inside a disk of radius `radius`."""

  def _sample_data(self, length: int, origin_x: float = 0.,
                   origin_y: float = 0., radius: float = 2.):
    """Returns `[xs, ys]` for `length` points around `(origin_x, origin_y)`.

    Points are built in polar form: an angle drawn from [0, 2*pi) and a
    radius equal to `radius` times the square root of a draw from [0, 1).
    """
    angles = self._random_sequence(length=length, low=0.0, high=2.0 * np.pi)
    unit = self._random_sequence(length=length, low=0.0, high=1.0)
    radii = radius * np.sqrt(unit)
    xs = radii * np.cos(angles) + origin_x
    ys = radii * np.sin(angles) + origin_y
    return [xs, ys]
# Registry mapping each algorithm name to the sampler class that generates
# its inputs. Consulted by `build_sampler`.
SAMPLERS = {
    # Sorting and selection.
    'insertion_sort': SortingSampler,
    'bubble_sort': SortingSampler,
    'heapsort': SortingSampler,
    'quicksort': SortingSampler,
    'quickselect': SortingSampler,
    'minimum': SortingSampler,
    # Searching and divide-and-conquer.
    'binary_search': SearchSampler,
    'find_maximum_subarray': MaxSubarraySampler,
    'find_maximum_subarray_kadane': MaxSubarraySampler,
    # Dynamic programming.
    'matrix_chain_order': SortingSampler,
    'lcs_length': LCSSampler,
    'optimal_bst': OptimalBSTSampler,
    # Greedy.
    'activity_selector': ActivitySampler,
    'task_scheduling': TaskSampler,
    # Graphs.
    'dfs': DfsSampler,
    'topological_sort': TopoSampler,
    'strongly_connected_components': SccSampler,
    'articulation_points': ArticulationSampler,
    'bridges': ArticulationSampler,
    'bfs': BfsSampler,
    'mst_kruskal': MSTSampler,
    'mst_prim': BellmanFordSampler,
    'bellman_ford': BellmanFordSampler,
    'dag_shortest_paths': DAGPathSampler,
    'dijkstra': BellmanFordSampler,
    'floyd_warshall': FloydWarshallSampler,
    'bipartite_matching': BipartiteSampler,
    # Strings.
    'naive_string_matcher': MatcherSampler,
    'kmp_matcher': MatcherSampler,
    # Geometry.
    'segments_intersect': SegmentsSampler,
    'graham_scan': ConvexHullSampler,
    'jarvis_march': ConvexHullSampler,
}
def _batch_io(traj_io: Trajectories) -> Trajectory:
  """Concatenates per-sample input/output probes along their leading axis.

  Args:
    traj_io: A non-empty sequence of samples, each a sequence of `DataPoint`s
      whose `data` has a leading axis of size 1. Probe names must line up
      across samples.

  Returns:
    One `DataPoint` per probe, with the samples' `data` concatenated.
  """
  assert traj_io  # non-empty
  reference = traj_io[0]
  for sample_io in traj_io:
    for probe_idx, dp in enumerate(sample_io):
      assert dp.data.shape[0] == 1  # batching axis
      assert reference[probe_idx].name == dp.name

  def _concat(*leaves):
    return np.concatenate(leaves)

  return jax.tree_util.tree_map(_concat, *traj_io)
def _batch_hints(
    traj_hints: Trajectories, min_steps: int) -> Tuple[Trajectory, List[int]]:
  """Batches variable-length hint trajectories into zero-padded arrays.

  Every sample's hints are copied into a zero-filled buffer whose time axis
  is the longest trajectory found, or `min_steps` if that is larger.

  Args:
    traj_hints: A non-empty sequence of samples, each a sequence of hint
      `DataPoint`s with `data` shaped [time, 1, ...].
    min_steps: Lower bound for the padded time dimension.

  Returns:
    A pair `(batched, lengths)`: one `DataPoint` per probe with `data` shaped
    [time, num_samples, ...], and an array holding each sample's unpadded
    trajectory length.
  """
  assert traj_hints  # non-empty
  max_steps = min_steps
  for sample_hint in traj_hints:
    for dp in sample_hint:
      assert dp.data.shape[1] == 1  # batching axis
      max_steps = max(max_steps, dp.data.shape[0])

  num_samples = len(traj_hints)
  time_and_batch = (max_steps, num_samples)

  # Zero-filled destination buffers, one per probe; each hint is then copied
  # in up to its own length.
  batched_traj = jax.tree_util.tree_map(
      lambda x: np.zeros(time_and_batch + x.shape[2:]),
      traj_hints[0])
  hint_lengths = np.zeros(num_samples)

  for sample_idx, cur_sample in enumerate(traj_hints):
    for probe_idx, dp in enumerate(cur_sample):
      target = batched_traj[probe_idx]
      assert target.name == dp.name
      steps = dp.data.shape[0]
      target.data[:steps, sample_idx:sample_idx+1] = dp.data
      if probe_idx == 0:
        hint_lengths[sample_idx] = steps
      else:
        # All probes of one sample must span the same number of steps.
        assert hint_lengths[sample_idx] == steps
  return batched_traj, hint_lengths
def _subsample_data(
    trajectory: Trajectory,
    idx: List[int],
    axis: int = 0,
) -> Trajectory:
  """Returns new `DataPoint`s whose `data` keeps only indices `idx` on `axis`."""
  return [
      probing.DataPoint(dp.name, dp.location, dp.type_,
                        np.take(dp.data, idx, axis=axis))
      for dp in trajectory
  ]
def _preprocess_permutations(probes, enforce_permutations):
  """Rewrites `SHOULD_BE_PERMUTATION` probes; other probes pass through.

  Args:
    probes: Iterable of `DataPoint`s.
    enforce_permutations: If true, each should-be-permutation probe becomes a
      `PERMUTATION_POINTER` probe followed by a `MASK_ONE` probe named
      `<name>_mask`; otherwise it is retyped as a plain `POINTER`.

  Returns:
    A list of `DataPoint`s.
  """
  output = []
  for probe in probes:
    if probe.type_ == specs.Type.SHOULD_BE_PERMUTATION:
      assert probe.location == specs.Location.NODE
      if not enforce_permutations:
        output.append(probing.DataPoint(name=probe.name,
                                        location=probe.location,
                                        type_=specs.Type.POINTER,
                                        data=probe.data))
        continue
      new_x, mask = probing.predecessor_to_cyclic_predecessor_and_first(
          probe.data)
      output.extend([
          probing.DataPoint(
              name=probe.name,
              location=probe.location,
              type_=specs.Type.PERMUTATION_POINTER,
              data=new_x),
          probing.DataPoint(
              name=probe.name + '_mask',
              location=probe.location,
              type_=specs.Type.MASK_ONE,
              data=mask),
      ])
    else:
      output.append(probe)
  return output
def process_permutations(spec, sample_iterator, enforce_permutations):
  """Replace should-be permutations with proper permutation pointer + mask.

  Args:
    spec: Algorithm spec mapping probe names to `(stage, location, type)`.
    sample_iterator: Iterator yielding feedback samples.
    enforce_permutations: If true, node-level `SHOULD_BE_PERMUTATION` probes
      become `PERMUTATION_POINTER` plus an extra `<name>_mask` `MASK_ONE`
      probe; otherwise they become plain `POINTER` probes.

  Returns:
    A pair `(new_spec, iterator)`: the rewritten spec and an iterator over
    the samples with inputs, hints and outputs rewritten the same way. The
    returned iterator ends cleanly when `sample_iterator` is exhausted.
  """
  def _iterate():
    # A `for` loop lets exhaustion of the source end this generator normally.
    # A bare `next()` here would surface as RuntimeError under PEP 479.
    for feedback in sample_iterator:
      features = feedback.features
      inputs = _preprocess_permutations(features.inputs, enforce_permutations)
      hints = _preprocess_permutations(features.hints, enforce_permutations)
      outputs = _preprocess_permutations(feedback.outputs, enforce_permutations)
      features = features._replace(inputs=tuple(inputs),
                                   hints=tuple(hints))
      feedback = feedback._replace(features=features,
                                   outputs=outputs)
      yield feedback

  new_spec = {}
  for k in spec:
    if (spec[k][1] == specs.Location.NODE and
        spec[k][2] == specs.Type.SHOULD_BE_PERMUTATION):
      if enforce_permutations:
        new_spec[k] = (spec[k][0], spec[k][1], specs.Type.PERMUTATION_POINTER)
        new_spec[k + '_mask'] = (spec[k][0], spec[k][1], specs.Type.MASK_ONE)
      else:
        new_spec[k] = (spec[k][0], spec[k][1], specs.Type.POINTER)
    else:
      new_spec[k] = spec[k]

  return new_spec, _iterate()
def process_pred_as_input(spec, sample_iterator):
  """Move pred_h hint to pred input.

  Args:
    spec: Algorithm spec mapping probe names to `(stage, location, type)`.
    sample_iterator: Iterator yielding feedback samples.

  Returns:
    A pair `(new_spec, iterator)`. In `new_spec` the `pred_h` hint entry is
    replaced by a `pred` input entry. The iterator yields samples in which
    the `pred_h` hint (if present) has been removed and its first time step
    appended to the inputs as `pred`; samples without `pred_h` pass through
    unchanged. The iterator ends cleanly when `sample_iterator` is exhausted.
  """
  def _iterate():
    # A `for` loop lets exhaustion of the source end this generator normally.
    # A bare `next()` here would surface as RuntimeError under PEP 479.
    for feedback in sample_iterator:
      features = feedback.features
      pred_h = [h for h in features.hints if h.name == 'pred_h']
      if pred_h:
        assert len(pred_h) == 1
        pred_h = pred_h[0]
        hints = [h for h in features.hints if h.name != 'pred_h']
        # The hint may only become an input if it is constant over time
        # within each sample's valid length.
        for i in range(len(features.lengths)):
          assert np.sum(np.abs(pred_h.data[1:int(features.lengths[i]), i] -
                               pred_h.data[0, i])) == 0.0
        inputs = tuple(features.inputs) + (
            probing.DataPoint(name='pred', location=pred_h.location,
                              type_=pred_h.type_, data=pred_h.data[0]),)
        features = features._replace(inputs=tuple(inputs),
                                     hints=tuple(hints))
        feedback = feedback._replace(features=features)
      yield feedback

  new_spec = {}
  for k in spec:
    if k == 'pred_h':
      assert spec[k] == (specs.Stage.HINT, specs.Location.NODE,
                         specs.Type.POINTER)
      new_spec['pred'] = (specs.Stage.INPUT, specs.Location.NODE,
                          specs.Type.POINTER)
    else:
      new_spec[k] = spec[k]

  return new_spec, _iterate()
def process_random_pos(sample_iterator, rng):
  """Randomize the `pos` input from a sampler.

  The `pos` input is, by default, a scalar uniformly spaced between 0 and 1
  across the nodes. The exception are string algorithms (naive_string_matcher,
  kmp_matcher and lcs_length), where the `pos` sequence is split into
  needle and haystack (or first and second string, for lcs_length). Here
  we replace the uniformly spaced `pos` with an ordered sequence of random
  scalars, or, for string algorithms, two ordered sequences of random scalars.

  Note that the `pos` probe object is updated in place (its `data` attribute
  is reassigned) and `inputs` is returned as a list.

  Args:
    sample_iterator: An iterator producing samples with non-random `pos` inputs.
    rng: Numpy random generator

  Returns:
    An iterator returning the samples with randomized `pos` inputs. It ends
    cleanly when `sample_iterator` is exhausted.
  """
  def _iterate():
    # A `for` loop lets exhaustion of the source end this generator normally.
    # A bare `next()` here would surface as RuntimeError under PEP 479.
    for feedback in sample_iterator:
      inputs = feedback.features.inputs
      pos, = [x for x in inputs if x.name == 'pos']
      batch_size, num_nodes = pos.data.shape
      unsorted = rng.uniform(size=(batch_size, num_nodes))
      new_pos = []
      for i in range(batch_size):  # we check one example at a time.
        # We find if there are splits in the pos sequence, marked by zeros.
        # We know there will always be at least 1 zero, if there's no split.
        split, = np.where(pos.data[i] == 0)
        split = np.concatenate([split, [num_nodes]])
        # We construct the randomized pos by sorting the random values in each
        # split and concatenating them.
        new_pos.append(
            np.concatenate([np.sort(unsorted[i, start:stop])
                            for start, stop in zip(split[:-1], split[1:])]))
      pos.data = np.array(new_pos)
      inputs = [(pos if x.name == 'pos' else x) for x in inputs]
      features = feedback.features._replace(inputs=inputs)
      feedback = feedback._replace(features=features)
      yield feedback

  return _iterate()

benchmarks/CLRS/env/samplers_test.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for `samplers.py`."""
+from absl.testing import absltest
+from absl.testing import parameterized
+import chex
+from clrs._src import probing
+from clrs._src import samplers
+from clrs._src import specs
+import jax
+import numpy as np
class SamplersTest(parameterized.TestCase):
  """Tests for sampler construction, determinism and the batching helpers."""

  @parameterized.parameters(*specs.CLRS_30_ALGS)
  def test_sampler_determinism(self, name):
    sampler, _ = samplers.build_sampler(name, 3, 10)

    np.random.seed(47)  # Set seed
    expected = sampler.next().outputs[0].data.copy()

    np.random.seed(48)  # Set a different seed
    actual = sampler.next().outputs[0].data.copy()

    # Validate that datasets are the same.
    np.testing.assert_array_equal(expected, actual)

  @parameterized.parameters(*specs.CLRS_30_ALGS)
  def test_sampler_batch_determinism(self, name):
    build_args = dict(seed=0)
    sampler_1, _ = samplers.build_sampler(name, 10, 10, **build_args)
    sampler_2, _ = samplers.build_sampler(name, 10, 10, **build_args)

    batch_size = 5
    feedback_1 = sampler_1.next(batch_size)
    feedback_2 = sampler_2.next(batch_size)

    # Validate that datasets are the same.
    jax.tree_util.tree_map(np.testing.assert_array_equal, feedback_1,
                           feedback_2)

  def test_end_to_end(self):
    num_samples, num_nodes = 7, 3
    sampler, _ = samplers.build_sampler("bfs", num_samples, num_nodes)
    feedback = sampler.next()
    expected_shape = (num_samples, num_nodes)

    inputs = feedback.features.inputs
    self.assertLen(inputs, 4)
    first_input = inputs[0]
    self.assertEqual(first_input.name, "pos")
    self.assertEqual(first_input.data.shape, expected_shape)

    outputs = feedback.outputs
    self.assertLen(outputs, 1)
    first_output = outputs[0]
    self.assertEqual(first_output.name, "pi")
    self.assertEqual(first_output.data.shape, expected_shape)

  def _assert_batch_dims(self, feedback, expected_batch):
    """Checks the batch axis of every probe in `feedback`."""
    for dp in feedback.features.inputs:  # [B, ...]
      self.assertEqual(dp.data.shape[0], expected_batch)
    for dp in feedback.outputs:  # [B, ...]
      self.assertEqual(dp.data.shape[0], expected_batch)
    for dp in feedback.features.hints:  # [T, B, ...]
      self.assertEqual(dp.data.shape[1], expected_batch)
    self.assertLen(feedback.features.lengths, expected_batch)

  def test_batch_size(self):
    num_samples, num_nodes = 7, 3
    sampler, _ = samplers.build_sampler("bfs", num_samples, num_nodes)

    # Full-batch.
    self._assert_batch_dims(sampler.next(), num_samples)

    # Specified batch.
    batch_size = 5
    self._assert_batch_dims(sampler.next(batch_size), batch_size)

  def test_batch_io(self):
    probe_x = probing.DataPoint(
        name="x",
        location=specs.Location.NODE,
        type_=specs.Type.SCALAR,
        data=np.zeros([1, 3]),
    )
    probe_y = probing.DataPoint(
        name="y",
        location=specs.Location.EDGE,
        type_=specs.Type.MASK,
        data=np.zeros([1, 3, 3]),
    )
    sample = [probe_x, probe_y]
    trajectory = [sample.copy() for _ in range(4)]

    batched = samplers._batch_io(trajectory)

    np.testing.assert_array_equal(batched[0].data, np.zeros([4, 3]))
    np.testing.assert_array_equal(batched[1].data, np.zeros([4, 3, 3]))

  def test_batch_hint(self):
    def make_sample(num_steps):
      return [
          probing.DataPoint(
              name=probe_name,
              location=specs.Location.NODE,
              type_=probe_type,
              data=np.zeros([num_steps, 1, 3]),
          )
          for probe_name, probe_type in (("x", specs.Type.MASK),
                                         ("y", specs.Type.POINTER))
      ]

    trajectory = [make_sample(2), make_sample(1)]

    # (min_steps passed in, expected padded time dimension)
    for min_steps, padded_steps in ((0, 2), (5, 5)):
      batched, lengths = samplers._batch_hints(trajectory, min_steps)
      expected = np.zeros([padded_steps, 2, 3])
      np.testing.assert_array_equal(batched[0].data, expected)
      np.testing.assert_array_equal(batched[1].data, expected)
      np.testing.assert_array_equal(lengths, np.array([2, 1]))

  def test_padding(self):
    lens = np.random.choice(10, (10,), replace=True) + 1
    trajectory = [
        [
            probing.DataPoint(
                name="x",
                location=specs.Location.NODE,
                type_=specs.Type.MASK,
                data=np.ones([len_, 1, 3]),
            )
        ]
        for len_ in lens
    ]

    batched, lengths = samplers._batch_hints(trajectory, 0)
    np.testing.assert_array_equal(lengths, lens)

    padded = batched[0].data
    for i, len_ in enumerate(lens):
      ones = padded[:len_, i, :]
      zeros = padded[len_:, i, :]
      np.testing.assert_array_equal(ones, np.ones_like(ones))
      np.testing.assert_array_equal(zeros, np.zeros_like(zeros))
class ProcessRandomPosTest(parameterized.TestCase):
  """Tests for `samplers.process_random_pos`."""

  @parameterized.parameters(["insertion_sort", "naive_string_matcher"])
  def test_random_pos(self, algorithm_name):
    batch_size, length = 12, 10

    def _make_sampler():
      sampler, _ = samplers.build_sampler(
          algorithm_name,
          seed=0,
          num_samples=100,
          length=length,
          )
      while True:
        yield sampler.next(batch_size)

    plain_batches = _make_sampler()
    randomized_batches = samplers.process_random_pos(
        _make_sampler(), np.random.RandomState(0))

    batch_without_rand_pos = next(plain_batches)
    batch_with_rand_pos = next(randomized_batches)

    input_names = [x.name for x in batch_without_rand_pos.features.inputs]
    pos_idx = input_names.index("pos")
    fixed_pos = batch_without_rand_pos.features.inputs[pos_idx]
    rand_pos = batch_with_rand_pos.features.inputs[pos_idx]

    # The randomized probe keeps its metadata and shape.
    self.assertEqual(rand_pos.location, specs.Location.NODE)
    self.assertEqual(rand_pos.type_, specs.Type.SCALAR)
    self.assertEqual(rand_pos.data.shape, (batch_size, length))
    self.assertEqual(rand_pos.data.shape, fixed_pos.data.shape)
    self.assertEqual(rand_pos.type_, fixed_pos.type_)
    self.assertEqual(rand_pos.location, fixed_pos.location)

    # Random positions vary across the batch; fixed ones do not.
    assert (rand_pos.data.std(axis=0) > 1e-3).all()
    assert (fixed_pos.data.std(axis=0) < 1e-9).all()

    if "string" in algorithm_name:
      expected = np.concatenate([np.arange(4*length//5)/(4*length//5),
                                 np.arange(length//5)/(length//5)])
    else:
      expected = np.arange(length)/length
    np.testing.assert_array_equal(
        fixed_pos.data, np.broadcast_to(expected, (batch_size, length)))

    # Apart from `pos`, both batches must be identical.
    batch_with_rand_pos.features.inputs[pos_idx] = fixed_pos
    chex.assert_trees_all_equal(batch_with_rand_pos, batch_without_rand_pos)
# Run the absl test runner when this file is executed as a script.
if __name__ == "__main__":
  absltest.main()

benchmarks/CLRS/env/specs.py ADDED Viewed

	@@ -0,0 +1,525 @@

+# Copyright 2021 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
"""Algorithm specs.

The "spec" of each algorithm is a static set of `(stage, location, type)`-tuples.
- `stage`: One of `input`, `output` or `hint`
- `location`: Each datum is associated with either the `node`, `edge` or `graph`
- `type`: One of `scalar`, `categorical`, `mask`, `mask_one`, `pointer`,
  `should_be_permutation`, `permutation_pointer` or `soft_pointer`
The dataflow for an algorithm is represented by `(stage, location, type, data)`
"probes" that are valid under that algorithm's spec. It contains a single
snapshot for each `input` and `output` and a time-series of intermediate
algorithmic states (`hint`).
At minimum, each node contains a `pos` probe that serves as a unique index, e.g.
for representing sequential data where appropriate.
"""
+import types
+from typing import Dict, Tuple
class Stage:
  """String constants for the stage a probe belongs to."""
  INPUT = 'input'
  OUTPUT = 'output'
  HINT = 'hint'
class Location:
  """String constants for where a probe's data is attached."""
  NODE = 'node'
  EDGE = 'edge'
  GRAPH = 'graph'
class Type:
  """String constants for the data type of a probe."""
  SCALAR = 'scalar'
  CATEGORICAL = 'categorical'
  MASK = 'mask'
  MASK_ONE = 'mask_one'
  POINTER = 'pointer'
  SHOULD_BE_PERMUTATION = 'should_be_permutation'
  PERMUTATION_POINTER = 'permutation_pointer'
  SOFT_POINTER = 'soft_pointer'
class OutputClass:
  """Integer labels for output classes, including a masked-out marker."""
  POSITIVE = 1
  NEGATIVE = 0
  MASKED = -1
# A spec maps each probe name to its `(stage, location, type)` triple.
Spec = Dict[str, Tuple[str, str, str]]

# Names of the algorithms that make up the CLRS-30 benchmark.
CLRS_30_ALGS = [
    'articulation_points',
    'activity_selector',
    'bellman_ford',
    'bfs',
    'binary_search',
    'bridges',
    'bubble_sort',
    'dag_shortest_paths',
    'dfs',
    'dijkstra',
    'find_maximum_subarray_kadane',
    'floyd_warshall',
    'graham_scan',
    'heapsort',
    'insertion_sort',
    'jarvis_march',
    'kmp_matcher',
    'lcs_length',
    'matrix_chain_order',
    'minimum',
    'mst_kruskal',
    'mst_prim',
    'naive_string_matcher',
    'optimal_bst',
    'quickselect',
    'quicksort',
    'segments_intersect',
    'strongly_connected_components',
    'task_scheduling',
    'topological_sort',
]

ALGO_IDX_INPUT_NAME = 'algo_idx'

# Algorithms have varying numbers of signals they are evaluated on.
# To compensate for that, we issue more samples for those who use a small
# number of signals.
CLRS_30_ALGS_SETTINGS = {}
for alg in CLRS_30_ALGS:
  # Each algorithm gets its own settings dict, starting from multiplier 1.
  CLRS_30_ALGS_SETTINGS[alg] = {'num_samples_multiplier': 1}

CLRS_30_ALGS_SETTINGS['find_maximum_subarray_kadane'].update(
    num_samples_multiplier=32)

for alg in ['quickselect', 'minimum', 'binary_search', 'naive_string_matcher',
            'kmp_matcher', 'segments_intersect']:
  CLRS_30_ALGS_SETTINGS[alg].update(num_samples_multiplier=64)
+# Read-only (MappingProxyType) table keyed by algorithm name. Each value is a
+# dict mapping a field name to a (Stage, Location, Type) triple, where Stage is
+# one of INPUT / OUTPUT / HINT and Location is one of NODE / EDGE / GRAPH.
+# Only the outer mapping is read-only; the per-algorithm dicts are plain dicts.
+SPECS = types.MappingProxyType({
+    'insertion_sort': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'pred': (Stage.OUTPUT, Location.NODE, Type.SHOULD_BE_PERMUTATION),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'bubble_sort': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'pred': (Stage.OUTPUT, Location.NODE, Type.SHOULD_BE_PERMUTATION),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'heapsort': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'pred': (Stage.OUTPUT, Location.NODE, Type.SHOULD_BE_PERMUTATION),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'parent': (Stage.HINT, Location.NODE, Type.POINTER),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'largest': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'heap_size': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.CATEGORICAL)
+    },
+    'quicksort': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'pred': (Stage.OUTPUT, Location.NODE, Type.SHOULD_BE_PERMUTATION),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'p': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'r': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'quickselect': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'median': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'p': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'r': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i_rank': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'target': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'minimum': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'min': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'min_h': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'binary_search': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'target': (Stage.INPUT, Location.GRAPH, Type.SCALAR),
+        'return': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'mid': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'find_maximum_subarray': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'start': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'end': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'mid': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'left_low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'left_high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'left_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'right_low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'right_high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'right_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'cross_low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'cross_high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'cross_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'ret_low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'ret_high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'ret_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'left_x_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'right_x_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'phase': (Stage.HINT, Location.GRAPH, Type.CATEGORICAL)
+    },
+    'find_maximum_subarray_kadane': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'start': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'end': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'best_low': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'best_high': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'best_sum': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'sum': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'matrix_chain_order': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'p': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.OUTPUT, Location.EDGE, Type.POINTER),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'm': (Stage.HINT, Location.EDGE, Type.SCALAR),
+        's_h': (Stage.HINT, Location.EDGE, Type.POINTER),
+        'msk': (Stage.HINT, Location.EDGE, Type.MASK)
+    },
+    'lcs_length': {
+        'string': (Stage.INPUT, Location.NODE, Type.MASK),
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.CATEGORICAL),
+        'b': (Stage.OUTPUT, Location.EDGE, Type.CATEGORICAL),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'b_h': (Stage.HINT, Location.EDGE, Type.CATEGORICAL),
+        'c': (Stage.HINT, Location.EDGE, Type.SCALAR)
+    },
+    'optimal_bst': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'p': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'q': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'root': (Stage.OUTPUT, Location.EDGE, Type.POINTER),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'root_h': (Stage.HINT, Location.EDGE, Type.POINTER),
+        'e': (Stage.HINT, Location.EDGE, Type.SCALAR),
+        'w': (Stage.HINT, Location.EDGE, Type.SCALAR),
+        'msk': (Stage.HINT, Location.EDGE, Type.MASK)
+    },
+    'activity_selector': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'f': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'selected': (Stage.OUTPUT, Location.NODE, Type.MASK),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'selected_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'm': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'k': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'task_scheduling': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'd': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'w': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'selected': (Stage.OUTPUT, Location.NODE, Type.MASK),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'selected_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        't': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'dfs': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'f': (Stage.HINT, Location.NODE, Type.SCALAR),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'time': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'topological_sort': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'topo': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'topo_head': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'topo_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'topo_head_h': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'strongly_connected_components': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'scc_id': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'scc_id_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'A_t': (Stage.HINT, Location.EDGE, Type.MASK),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'f': (Stage.HINT, Location.NODE, Type.SCALAR),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'time': (Stage.HINT, Location.GRAPH, Type.SCALAR),
+        'phase': (Stage.HINT, Location.GRAPH, Type.MASK)
+    },
+    'articulation_points': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'is_cut': (Stage.OUTPUT, Location.NODE, Type.MASK),
+        'is_cut_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'f': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'low': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'child_cnt': (Stage.HINT, Location.NODE, Type.SCALAR),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'time': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'bridges': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'is_bridge': (Stage.OUTPUT, Location.EDGE, Type.MASK),
+        'is_bridge_h': (Stage.HINT, Location.EDGE, Type.MASK),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'f': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'low': (Stage.HINT, Location.NODE, Type.SCALAR),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'time': (Stage.HINT, Location.GRAPH, Type.SCALAR)
+    },
+    'bfs': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'reach_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER)
+    },
+    'mst_kruskal': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'in_mst': (Stage.OUTPUT, Location.EDGE, Type.MASK),
+        'in_mst_h': (Stage.HINT, Location.EDGE, Type.MASK),
+        'pi': (Stage.HINT, Location.NODE, Type.POINTER),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'root_u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'root_v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'mask_u': (Stage.HINT, Location.NODE, Type.MASK),
+        'mask_v': (Stage.HINT, Location.NODE, Type.MASK),
+        'phase': (Stage.HINT, Location.GRAPH, Type.CATEGORICAL)
+    },
+    'mst_prim': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'key': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'mark': (Stage.HINT, Location.NODE, Type.MASK),
+        'in_queue': (Stage.HINT, Location.NODE, Type.MASK),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'bellman_ford': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'msk': (Stage.HINT, Location.NODE, Type.MASK)
+    },
+    'dag_shortest_paths': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'mark': (Stage.HINT, Location.NODE, Type.MASK),
+        'topo_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'topo_head_h': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'color': (Stage.HINT, Location.NODE, Type.CATEGORICAL),
+        's_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'v': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        's_last': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.MASK)
+    },
+    'dijkstra': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'pi': (Stage.OUTPUT, Location.NODE, Type.POINTER),
+        'pi_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'mark': (Stage.HINT, Location.NODE, Type.MASK),
+        'in_queue': (Stage.HINT, Location.NODE, Type.MASK),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'floyd_warshall': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        'Pi': (Stage.OUTPUT, Location.EDGE, Type.POINTER),
+        'Pi_h': (Stage.HINT, Location.EDGE, Type.POINTER),
+        'D': (Stage.HINT, Location.EDGE, Type.SCALAR),
+        'msk': (Stage.HINT, Location.EDGE, Type.MASK),
+        'k': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'bipartite_matching': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'A': (Stage.INPUT, Location.EDGE, Type.SCALAR),
+        'adj': (Stage.INPUT, Location.EDGE, Type.MASK),
+        's': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        't': (Stage.INPUT, Location.NODE, Type.MASK_ONE),
+        'in_matching': (Stage.OUTPUT, Location.EDGE, Type.MASK),
+        'in_matching_h': (Stage.HINT, Location.EDGE, Type.MASK),
+        'A_h': (Stage.HINT, Location.EDGE, Type.SCALAR),
+        'adj_h': (Stage.HINT, Location.EDGE, Type.MASK),
+        'd': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'msk': (Stage.HINT, Location.NODE, Type.MASK),
+        'pi': (Stage.HINT, Location.NODE, Type.POINTER),
+        'u': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.MASK)
+    },
+    'naive_string_matcher': {
+        'string': (Stage.INPUT, Location.NODE, Type.MASK),
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.CATEGORICAL),
+        'match': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE)
+    },
+    'kmp_matcher': {
+        'string': (Stage.INPUT, Location.NODE, Type.MASK),
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'key': (Stage.INPUT, Location.NODE, Type.CATEGORICAL),
+        'match': (Stage.OUTPUT, Location.NODE, Type.MASK_ONE),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'pi': (Stage.HINT, Location.NODE, Type.POINTER),
+        'is_reset': (Stage.HINT, Location.NODE, Type.MASK),
+        'k': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'k_reset': (Stage.HINT, Location.GRAPH, Type.MASK),
+        'q': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'q_reset': (Stage.HINT, Location.GRAPH, Type.MASK),
+        's': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.MASK)
+    },
+    'segments_intersect': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'x': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'y': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'intersect': (Stage.OUTPUT, Location.GRAPH, Type.MASK),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'j': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'k': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'dir': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'on_seg': (Stage.HINT, Location.NODE, Type.MASK)
+    },
+    'graham_scan': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'x': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'y': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'in_hull': (Stage.OUTPUT, Location.NODE, Type.MASK),
+        'best': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'atans': (Stage.HINT, Location.NODE, Type.SCALAR),
+        'in_hull_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'stack_prev': (Stage.HINT, Location.NODE, Type.POINTER),
+        'last_stack': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.CATEGORICAL)
+    },
+    'jarvis_march': {
+        'pos': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'x': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'y': (Stage.INPUT, Location.NODE, Type.SCALAR),
+        'in_hull': (Stage.OUTPUT, Location.NODE, Type.MASK),
+        'pred_h': (Stage.HINT, Location.NODE, Type.POINTER),
+        'in_hull_h': (Stage.HINT, Location.NODE, Type.MASK),
+        'best': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'last_point': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'endpoint': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'i': (Stage.HINT, Location.NODE, Type.MASK_ONE),
+        'phase': (Stage.HINT, Location.GRAPH, Type.CATEGORICAL)
+    }
+})

benchmarks/CLRS/env/train.py ADDED Viewed

	@@ -0,0 +1,560 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run training of one or more algorithmic tasks from CLRS."""
+import os
+# disable logging until training starts
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import functools
+import os
+import shutil
+from typing import Any, Dict, List
+from absl import app
+from absl import flags
+from absl import logging
+# disable logging until training starts
+logging.set_verbosity(logging.ERROR)
+import clrs
+import jax
+import numpy as np
+import requests
+import tensorflow as tf
+from baselines import BaselineModel, BaselineModelChunked
+import pickle
+import copy
+flags.DEFINE_list('algorithms', ['floyd_warshall'], 'Which algorithms to run.')
+flags.DEFINE_list('train_lengths', ['4', '7', '11', '13', '16'],
+                  'Which training sizes to use. A size of -1 means '
+                  'use the benchmark dataset.')
+flags.DEFINE_integer('length_needle', -8,
+                     'Length of needle for training and validation '
+                     '(not testing) in string matching algorithms. '
+                     'A negative value randomizes the length for each sample '
+                     'between 1 and the opposite of the value. '
+                     'A value of 0 means use always 1/4 of the length of '
+                     'the haystack (the default sampler behavior).')
+flags.DEFINE_integer('seed', 42, 'Random seed to set')
+flags.DEFINE_boolean('random_pos', True,
+                     'Randomize the pos input common to all algos.')
+flags.DEFINE_boolean('enforce_permutations', True,
+                     'Whether to enforce permutation-type node pointers.')
+flags.DEFINE_boolean('enforce_pred_as_input', True,
+                     'Whether to change pred_h hints into pred inputs.')
+flags.DEFINE_integer('batch_size', 32, 'Batch size used for training.')
+flags.DEFINE_boolean('chunked_training', False,
+                     'Whether to use chunking for training.')
+flags.DEFINE_integer('chunk_length', 16,
+                     'Time chunk length used for training (if '
+                     '`chunked_training` is True.')
+flags.DEFINE_integer('train_steps', 500, 'Number of training iterations.')
+flags.DEFINE_integer('eval_every', 50, 'Evaluation frequency (in steps).')
+flags.DEFINE_integer('test_every', 500, 'Evaluation frequency (in steps).')
+flags.DEFINE_integer('log_every', 50, 'Logging frequency (in steps).')
+flags.DEFINE_integer('hidden_size', 128,
+                     'Number of hidden units of the model.')
+flags.DEFINE_integer('nb_heads', 1, 'Number of heads for GAT processors')
+flags.DEFINE_integer('nb_msg_passing_steps', 1,
+                     'Number of message passing steps to run per hint.')
+flags.DEFINE_float('learning_rate', 0.001, 'Learning rate to use.')
+flags.DEFINE_float('grad_clip_max_norm', 1.0,
+                   'Gradient clipping by norm. 0.0 disables grad clipping')
+flags.DEFINE_float('dropout_prob', 0.0, 'Dropout rate to use.')
+flags.DEFINE_float('hint_teacher_forcing', 0.0,
+                   'Probability that ground-truth teacher hints are encoded '
+                   'during training instead of predicted hints. Only '
+                   'pertinent in encoded_decoded modes.')
+flags.DEFINE_enum('hint_mode', 'encoded_decoded',
+                  ['encoded_decoded', 'decoded_only', 'none'],
+                  'How should hints be used? Note, each mode defines a '
+                  'separate task, with various difficulties. `encoded_decoded` '
+                  'requires the model to explicitly materialise hint sequences '
+                  'and therefore is hardest, but also most aligned to the '
+                  'underlying algorithmic rule. Hence, `encoded_decoded` '
+                  'should be treated as the default mode for our benchmark. '
+                  'In `decoded_only`, hints are only used for defining '
+                  'reconstruction losses. Often, this will perform well, but '
+                  'note that we currently do not make any efforts to '
+                  'counterbalance the various hint losses. Hence, for certain '
+                  'tasks, the best performance will now be achievable with no '
+                  'hint usage at all (`none`).')
+flags.DEFINE_enum('hint_repred_mode', 'soft', ['soft', 'hard', 'hard_on_eval'],
+                  'How to process predicted hints when fed back as inputs.'
+                  'In soft mode, we use softmaxes for categoricals, pointers '
+                  'and mask_one, and sigmoids for masks. '
+                  'In hard mode, we use argmax instead of softmax, and hard '
+                  'thresholding of masks. '
+                  'In hard_on_eval mode, soft mode is '
+                  'used for training and hard mode is used for evaluation.')
+flags.DEFINE_boolean('use_ln', True,
+                     'Whether to use layer normalisation in the processor.')
+flags.DEFINE_boolean('use_lstm', False,
+                     'Whether to insert an LSTM after message passing.')
+flags.DEFINE_integer('nb_triplet_fts', 8,
+                     'How many triplet features to compute?')
+flags.DEFINE_enum('encoder_init', 'xavier_on_scalars',
+                  ['default', 'xavier_on_scalars'],
+                  'Initialiser to use for the encoders.')
+flags.DEFINE_enum('processor_type', 'triplet_gmpnn',
+                  ['deepsets', 'mpnn', 'pgn', 'pgn_mask',
+                   'triplet_mpnn', 'triplet_pgn', 'triplet_pgn_mask',
+                   'gat', 'gatv2', 'gat_full', 'gatv2_full',
+                   'gpgn', 'gpgn_mask', 'gmpnn',
+                   'triplet_gpgn', 'triplet_gpgn_mask', 'triplet_gmpnn'],
+                  'Processor type to use as the network P.')
+flags.DEFINE_string('checkpoint_path', './checkpoints',
+                    'Path in which checkpoints are saved.')
+flags.DEFINE_string('dataset_path', '/tmp/CLRS30',
+                    'Path in which dataset is stored.')
+flags.DEFINE_boolean('freeze_processor', False,
+                     'Whether to freeze the processor of the model.')
+FLAGS = flags.FLAGS
+PRED_AS_INPUT_ALGOS = [
+    'binary_search',
+    'minimum',
+    'find_maximum_subarray',
+    'find_maximum_subarray_kadane',
+    'matrix_chain_order',
+    'lcs_length',
+    'optimal_bst',
+    'activity_selector',
+    'task_scheduling',
+    'naive_string_matcher',
+    'kmp_matcher',
+    'jarvis_march']
+def unpack(v):
+  try:
+    return v.item()  # DeviceArray
+  except (AttributeError, ValueError):
+    return v
+def _iterate_sampler(sampler, batch_size):
+  while True:
+    yield sampler.next(batch_size)
+def _maybe_download_dataset(dataset_path):
+  """Download CLRS30 dataset if needed."""
+  dataset_folder = os.path.join(dataset_path, clrs.get_clrs_folder())
+  if os.path.isdir(dataset_folder):
+    logging.info('Dataset found at %s. Skipping download.', dataset_folder)
+    return dataset_folder
+  logging.info('Dataset not found in %s. Downloading...', dataset_folder)
+  clrs_url = clrs.get_dataset_gcp_url()
+  request = requests.get(clrs_url, allow_redirects=True)
+  clrs_file = os.path.join(dataset_path, os.path.basename(clrs_url))
+  os.makedirs(dataset_folder)
+  open(clrs_file, 'wb').write(request.content)
+  shutil.unpack_archive(clrs_file, extract_dir=dataset_folder)
+  os.remove(clrs_file)
+  return dataset_folder
+def make_sampler(length: int,
+                 rng: Any,
+                 algorithm: str,
+                 split: str,
+                 batch_size: int,
+                 multiplier: int,
+                 randomize_pos: bool,
+                 enforce_pred_as_input: bool,
+                 enforce_permutations: bool,
+                 chunked: bool,
+                 chunk_length: int,
+                 sampler_kwargs: Dict[str, Any]):
+  """Create a sampler with given options.
+  Args:
+    length: Size of samples (i.e., number of nodes in the graph).
+      A length of -1 will mean that the benchmark
+      dataset (for the given split) is used. Positive sizes will instantiate
+      samplers of the corresponding size.
+    rng: Numpy random state.
+    algorithm: The name of the algorithm to sample from.
+    split: 'train', 'val' or 'test'.
+    batch_size: Samples per batch.
+    multiplier: Integer multiplier for the number of samples in the dataset,
+      only used for positive sizes. Negative multiplier means infinite samples.
+    randomize_pos: Whether to randomize the `pos` input.
+    enforce_pred_as_input: Whether to convert fixed pred_h hints to inputs.
+    enforce_permutations: Whether to enforce permutation pointers.
+    chunked: Whether to chunk the dataset.
+    chunk_length: Unroll length of chunks, if `chunked` is True.
+    sampler_kwargs: Extra args passed to the sampler.
+  Returns:
+    A sampler (iterator), the number of samples in the iterator (negative
+    if infinite samples), and the spec.
+  """
+  if length < 0:  # load from file
+    dataset_folder = _maybe_download_dataset(FLAGS.dataset_path)
+    sampler, num_samples, spec = clrs.create_dataset(folder=dataset_folder,
+                                                     algorithm=algorithm,
+                                                     batch_size=batch_size,
+                                                     split=split)
+    sampler = sampler.as_numpy_iterator()
+  else:
+    num_samples = clrs.CLRS30[split]['num_samples'] * multiplier
+    sampler, spec = clrs.build_sampler(
+        algorithm,
+        seed=rng.randint(2**32),
+        num_samples=num_samples,
+        length=length,
+        **sampler_kwargs,
+        )
+    sampler = _iterate_sampler(sampler, batch_size)
+  if randomize_pos:
+    sampler = clrs.process_random_pos(sampler, rng)
+  if enforce_pred_as_input and algorithm in PRED_AS_INPUT_ALGOS:
+    spec, sampler = clrs.process_pred_as_input(spec, sampler)
+  spec, sampler = clrs.process_permutations(spec, sampler, enforce_permutations)
+  if chunked:
+    sampler = clrs.chunkify(sampler, chunk_length)
+  return sampler, num_samples, spec
+def make_multi_sampler(sizes, rng, **kwargs):
+  """Create a sampler with cycling sample sizes."""
+  ss = []
+  tot_samples = 0
+  for length in sizes:
+    sampler, num_samples, spec = make_sampler(length, rng, **kwargs)
+    ss.append(sampler)
+    tot_samples += num_samples
+  def cycle_samplers():
+    while True:
+      for s in ss:
+        yield next(s)
+  return cycle_samplers(), tot_samples, spec
+def _concat(dps, axis):
+  return jax.tree_util.tree_map(lambda *x: np.concatenate(x, axis), *dps)
+def collect_and_eval(sampler, predict_fn, sample_count, rng_key, extras):
+  """Collect batches of output and hint preds and evaluate them."""
+  processed_samples = 0
+  preds = []
+  outputs = []
+  while processed_samples < sample_count:
+    feedback = next(sampler)
+    batch_size = feedback.outputs[0].data.shape[0]
+    outputs.append(feedback.outputs)
+    new_rng_key, rng_key = jax.random.split(rng_key)
+    cur_preds, _ = predict_fn(new_rng_key, feedback.features)
+    preds.append(cur_preds)
+    processed_samples += batch_size
+  outputs = _concat(outputs, axis=0)
+  preds = _concat(preds, axis=0)
+  out = clrs.evaluate(outputs, preds)
+  if extras:
+    out.update(extras)
+  return {k: unpack(v) for k, v in out.items()}
+def create_samplers(rng, train_lengths: List[int]):
+  """Create all the samplers."""
+  train_samplers = []
+  val_samplers = []
+  val_sample_counts = []
+  test_samplers = []
+  test_sample_counts = []
+  spec_list = []
+  for algo_idx, algorithm in enumerate(FLAGS.algorithms):
+    # Make full dataset pipeline run on CPU (including prefetching).
+    with tf.device('/cpu:0'):
+      if algorithm in ['naive_string_matcher', 'kmp_matcher']:
+        # Fixed haystack + needle; variability will be in needle
+        # Still, for chunked training, we maintain as many samplers
+        # as train lengths, since, for each length there is a separate state,
+        # and we must keep the 1:1 relationship between states and samplers.
+        max_length = max(train_lengths)
+        if max_length > 0:  # if < 0, we are using the benchmark data
+          max_length = (max_length * 5) // 4
+        train_lengths = [max_length]
+        if FLAGS.chunked_training:
+          train_lengths = train_lengths * len(train_lengths)
+      logging.info('Creating samplers for algo %s', algorithm)
+      p = tuple([0.1 + 0.1 * i for i in range(9)])
+      if p and algorithm in ['articulation_points', 'bridges',
+                             'mst_kruskal', 'bipartite_matching']:
+        # Choose a lower connection probability for the above algorithms,
+        # otherwise trajectories are very long
+        p = tuple(np.array(p) / 2)
+      length_needle = FLAGS.length_needle
+      sampler_kwargs = dict(p=p, length_needle=length_needle)
+      if length_needle == 0:
+        sampler_kwargs.pop('length_needle')
+      common_sampler_args = dict(
+          algorithm=FLAGS.algorithms[algo_idx],
+          rng=rng,
+          enforce_pred_as_input=FLAGS.enforce_pred_as_input,
+          enforce_permutations=FLAGS.enforce_permutations,
+          chunk_length=FLAGS.chunk_length,
+          )
+      train_args = dict(sizes=train_lengths,
+                        split='train',
+                        batch_size=FLAGS.batch_size,
+                        multiplier=-1,
+                        randomize_pos=FLAGS.random_pos,
+                        chunked=FLAGS.chunked_training,
+                        sampler_kwargs=sampler_kwargs,
+                        **common_sampler_args)
+      train_sampler, _, spec = make_multi_sampler(**train_args)
+      mult = clrs.CLRS_30_ALGS_SETTINGS[algorithm]['num_samples_multiplier']
+      val_args = dict(sizes=[np.amax(train_lengths)],
+                      split='val',
+                      batch_size=32,
+                      multiplier=2 * mult,
+                      randomize_pos=FLAGS.random_pos,
+                      chunked=False,
+                      sampler_kwargs=sampler_kwargs,
+                      **common_sampler_args)
+      val_sampler, val_samples, spec = make_multi_sampler(**val_args)
+      test_args = dict(sizes=[-1],
+                       split='test',
+                       batch_size=32,
+                       multiplier=2 * mult,
+                       randomize_pos=False,
+                       chunked=False,
+                       sampler_kwargs={},
+                       **common_sampler_args)
+      test_sampler, test_samples, spec = make_multi_sampler(**test_args)
+    spec_list.append(spec)
+    train_samplers.append(train_sampler)
+    val_samplers.append(val_sampler)
+    val_sample_counts.append(val_samples)
+    test_samplers.append(test_sampler)
+    test_sample_counts.append(test_samples)
+  return (train_samplers,
+          val_samplers, val_sample_counts,
+          test_samplers, test_sample_counts,
+          spec_list)
+def main(unused_argv):
+  """Train the baseline model on `FLAGS.algorithms`, then test the best checkpoint.
+  Args:
+    unused_argv: Ignored.
+  Raises:
+    ValueError: If `FLAGS.hint_mode` is not one of the known modes.
+  """
+  # Translate the hint mode flag into the two booleans the model expects.
+  if FLAGS.hint_mode == 'encoded_decoded':
+    encode_hints = True
+    decode_hints = True
+  elif FLAGS.hint_mode == 'decoded_only':
+    encode_hints = False
+    decode_hints = True
+  elif FLAGS.hint_mode == 'none':
+    encode_hints = False
+    decode_hints = False
+  else:
+    raise ValueError('Hint mode not in {encoded_decoded, decoded_only, none}.')
+  train_lengths = [int(x) for x in FLAGS.train_lengths]
+  # One numpy random state seeds both the samplers and the JAX PRNG key.
+  rng = np.random.RandomState(FLAGS.seed)
+  rng_key = jax.random.PRNGKey(rng.randint(2**32))
+  # Create samplers
+  (train_samplers,
+   val_samplers, val_sample_counts,
+   test_samplers, test_sample_counts,
+   spec_list) = create_samplers(rng, train_lengths)
+  processor_factory = clrs.get_processor_factory(
+      FLAGS.processor_type,
+      use_ln=FLAGS.use_ln,
+      nb_triplet_fts=FLAGS.nb_triplet_fts,
+      nb_heads=FLAGS.nb_heads
+  )
+  model_params = dict(
+      processor_factory=processor_factory,
+      hidden_dim=FLAGS.hidden_size,
+      encode_hints=encode_hints,
+      decode_hints=decode_hints,
+      encoder_init=FLAGS.encoder_init,
+      use_lstm=FLAGS.use_lstm,
+      learning_rate=FLAGS.learning_rate,
+      grad_clip_max_norm=FLAGS.grad_clip_max_norm,
+      checkpoint_path=FLAGS.checkpoint_path,
+      freeze_processor=FLAGS.freeze_processor,
+      dropout_prob=FLAGS.dropout_prob,
+      hint_teacher_forcing=FLAGS.hint_teacher_forcing,
+      hint_repred_mode=FLAGS.hint_repred_mode,
+      nb_msg_passing_steps=FLAGS.nb_msg_passing_steps,
+      )
+  # save spec_list and model_params; do not change or delete!!
+  if not os.path.exists(FLAGS.checkpoint_path):
+    os.makedirs(FLAGS.checkpoint_path)
+  with open(os.path.join(FLAGS.checkpoint_path, 'spec_list.pkl'), 'wb') as f:
+    pickle.dump(spec_list, f)
+  # The saved copy stores the arguments of the processor factory instead of
+  # the factory object itself, so it can be rebuilt when the file is loaded.
+  model_params_save = copy.deepcopy(model_params)
+  model_params_save["processor_factory"] = (FLAGS.processor_type, FLAGS.use_ln, FLAGS.nb_triplet_fts, FLAGS.nb_heads)
+  with open(os.path.join(FLAGS.checkpoint_path, 'model_params.pkl'), 'wb') as f:
+    pickle.dump(model_params_save, f)
+  eval_model = BaselineModel(
+      spec=spec_list,
+      dummy_trajectory=[next(t) for t in val_samplers],
+      **model_params
+  )
+  # Chunked training uses a dedicated model class; otherwise the evaluation
+  # model doubles as the training model.
+  if FLAGS.chunked_training:
+    train_model = BaselineModelChunked(
+        spec=spec_list,
+        dummy_trajectory=[next(t) for t in train_samplers],
+        **model_params
+        )
+  else:
+    train_model = eval_model
+  # Training loop.
+  best_score = -1.0
+  current_train_items = [0] * len(FLAGS.algorithms)
+  step = 0
+  next_eval = 0
+  # Make sure scores improve on first step, but not overcome best score
+  # until all algos have had at least one evaluation.
+  val_scores = [-99999.9] * len(FLAGS.algorithms)
+  length_idx = 0
+  while step < FLAGS.train_steps:
+    # One batch per algorithm for this step.
+    feedback_list = [next(t) for t in train_samplers]
+    # Initialize model.
+    if step == 0:
+      all_features = [f.features for f in feedback_list]
+      if FLAGS.chunked_training:
+        # We need to initialize the model with samples of all lengths for
+        # all algorithms. Also, we need to make sure that the order of these
+        # sample sizes is the same as the order of the actual training sizes.
+        all_length_features = [all_features] + [
+            [next(t).features for t in train_samplers]
+            for _ in range(len(train_lengths))]
+        train_model.init(all_length_features[:-1], FLAGS.seed + 1)
+      else:
+        train_model.init(all_features, FLAGS.seed + 1)
+    # Training step.
+    # enable logging now that we have initialized the model
+    logging.set_verbosity(logging.INFO)
+    for algo_idx in range(len(train_samplers)):
+      feedback = feedback_list[algo_idx]
+      rng_key, new_rng_key = jax.random.split(rng_key)
+      if FLAGS.chunked_training:
+        # In chunked training, we must indicate which training length we are
+        # using, so the model uses the correct state.
+        length_and_algo_idx = (length_idx, algo_idx)
+      else:
+        # In non-chunked training, all training lengths can be treated equally,
+        # since there is no state to maintain between batches.
+        length_and_algo_idx = algo_idx
+      cur_loss = train_model.feedback(rng_key, feedback, length_and_algo_idx)
+      rng_key = new_rng_key
+      # Count the training examples consumed by this step.
+      if FLAGS.chunked_training:
+        examples_in_chunk = np.sum(feedback.features.is_last).item()
+      else:
+        examples_in_chunk = len(feedback.features.lengths)
+      current_train_items[algo_idx] += examples_in_chunk
+      if step % FLAGS.log_every == 0:
+        logging.info('Algo %s step %i current loss %f, current_train_items %i.',
+                    FLAGS.algorithms[algo_idx], step,
+                    cur_loss, current_train_items[algo_idx])
+    # Periodically evaluate model
+    if step >= next_eval:
+      # Evaluate with the parameters of the model being trained.
+      eval_model.params = train_model.params
+      for algo_idx in range(len(train_samplers)):
+        common_extras = {'examples_seen': current_train_items[algo_idx],
+                         'step': step,
+                         'algorithm': FLAGS.algorithms[algo_idx]}
+        # Validation info.
+        new_rng_key, rng_key = jax.random.split(rng_key)
+        val_stats = collect_and_eval(
+            val_samplers[algo_idx],
+            functools.partial(eval_model.predict, algorithm_index=algo_idx),
+            val_sample_counts[algo_idx],
+            new_rng_key,
+            extras=common_extras)
+        logging.info('(val) algo %s step %d: %s',
+                     FLAGS.algorithms[algo_idx], step, val_stats)
+        val_scores[algo_idx] = val_stats['score']
+      next_eval += FLAGS.eval_every
+      # If best total score, update best checkpoint.
+      # Also save a best checkpoint on the first step.
+      msg = (f'best avg val score was '
+             f'{best_score/len(FLAGS.algorithms):.3f}, '
+             f'current avg val score is {np.mean(val_scores):.3f}, '
+             f'val scores are: ')
+      msg += ', '.join(
+          ['%s: %.3f' % (x, y) for (x, y) in zip(FLAGS.algorithms, val_scores)])
+      if (sum(val_scores) > best_score) or step == 0:
+        best_score = sum(val_scores)
+        logging.info('Checkpointing best model, %s', msg)
+        train_model.save_model('best.pkl')
+      else:
+        logging.info('Not saving new best model, %s', msg)
+    step += 1
+    # Cycle through the training lengths, one per step.
+    length_idx = (length_idx + 1) % len(train_lengths)
+  logging.info('Restoring best model from checkpoint...')
+  eval_model.restore_model('best.pkl', only_load_processor=False)
+  # Final pass: score the restored checkpoint on the test samplers.
+  for algo_idx in range(len(train_samplers)):
+    common_extras = {'examples_seen': current_train_items[algo_idx],
+                     'step': step,
+                     'algorithm': FLAGS.algorithms[algo_idx]}
+    new_rng_key, rng_key = jax.random.split(rng_key)
+    test_stats = collect_and_eval(
+        test_samplers[algo_idx],
+        functools.partial(eval_model.predict, algorithm_index=algo_idx),
+        test_sample_counts[algo_idx],
+        new_rng_key,
+        extras=common_extras)
+    logging.info('(test) algo %s : %s', FLAGS.algorithms[algo_idx], test_stats)
+  logging.info('Done!')
+if __name__ == '__main__':
+  # Script entry point: absl parses the command-line flags, then calls main.
+  app.run(main)

benchmarks/CLRS/scripts/eval.py ADDED Viewed

	@@ -0,0 +1,454 @@

+# Copyright 2022 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Evaluate a saved CLRS baseline checkpoint on one or more algorithmic tasks."""
+import os
+# disable logging until training starts
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import functools
+import os
+import shutil
+from typing import Any, Dict, List
+from absl import app
+from absl import flags
+from absl import logging
+# disable logging until training starts
+logging.set_verbosity(logging.ERROR)
+import clrs
+import jax
+import numpy as np
+import requests
+import tensorflow as tf
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../env")))
+from baselines import BaselineModel, BaselineModelChunked
+import pickle
+flags.DEFINE_list('algorithms', ['floyd_warshall'], 'Which algorithms to run.')
+flags.DEFINE_list('train_lengths', ['4', '7', '11', '13', '16'],
+                  'Which training sizes to use. A size of -1 means '
+                  'use the benchmark dataset.')
+flags.DEFINE_integer('length_needle', -8,
+                     'Length of needle for training and validation '
+                     '(not testing) in string matching algorithms. '
+                     'A negative value randomizes the length for each sample '
+                     'between 1 and the opposite of the value. '
+                     'A value of 0 means use always 1/4 of the length of '
+                     'the haystack (the default sampler behavior).')
+flags.DEFINE_integer('seed', 42, 'Random seed to set')
+flags.DEFINE_boolean('random_pos', True,
+                     'Randomize the pos input common to all algos.')
+flags.DEFINE_boolean('enforce_permutations', True,
+                     'Whether to enforce permutation-type node pointers.')
+flags.DEFINE_boolean('enforce_pred_as_input', True,
+                     'Whether to change pred_h hints into pred inputs.')
+flags.DEFINE_integer('batch_size', 32, 'Batch size used for training.')
+flags.DEFINE_boolean('chunked_training', False,
+                     'Whether to use chunking for training.')
+flags.DEFINE_integer('chunk_length', 16,
+                     'Time chunk length used for training (if '
+                     '`chunked_training` is True.')
+flags.DEFINE_integer('train_steps', 1000, 'Number of training iterations.')
+flags.DEFINE_integer('eval_every', 50, 'Evaluation frequency (in steps).')
+flags.DEFINE_integer('test_every', 500, 'Evaluation frequency (in steps).')
+flags.DEFINE_integer('log_every', 50, 'Logging frequency (in steps).')
+flags.DEFINE_integer('hidden_size', 128,
+                     'Number of hidden units of the model.')
+flags.DEFINE_integer('nb_heads', 1, 'Number of heads for GAT processors')
+flags.DEFINE_integer('nb_msg_passing_steps', 1,
+                     'Number of message passing steps to run per hint.')
+flags.DEFINE_float('learning_rate', 0.001, 'Learning rate to use.')
+flags.DEFINE_float('grad_clip_max_norm', 1.0,
+                   'Gradient clipping by norm. 0.0 disables grad clipping')
+flags.DEFINE_float('dropout_prob', 0.0, 'Dropout rate to use.')
+flags.DEFINE_float('hint_teacher_forcing', 0.0,
+                   'Probability that ground-truth teacher hints are encoded '
+                   'during training instead of predicted hints. Only '
+                   'pertinent in encoded_decoded modes.')
+flags.DEFINE_enum('hint_mode', 'encoded_decoded',
+                  ['encoded_decoded', 'decoded_only', 'none'],
+                  'How should hints be used? Note, each mode defines a '
+                  'separate task, with various difficulties. `encoded_decoded` '
+                  'requires the model to explicitly materialise hint sequences '
+                  'and therefore is hardest, but also most aligned to the '
+                  'underlying algorithmic rule. Hence, `encoded_decoded` '
+                  'should be treated as the default mode for our benchmark. '
+                  'In `decoded_only`, hints are only used for defining '
+                  'reconstruction losses. Often, this will perform well, but '
+                  'note that we currently do not make any efforts to '
+                  'counterbalance the various hint losses. Hence, for certain '
+                  'tasks, the best performance will now be achievable with no '
+                  'hint usage at all (`none`).')
+flags.DEFINE_enum('hint_repred_mode', 'soft', ['soft', 'hard', 'hard_on_eval'],
+                  'How to process predicted hints when fed back as inputs.'
+                  'In soft mode, we use softmaxes for categoricals, pointers '
+                  'and mask_one, and sigmoids for masks. '
+                  'In hard mode, we use argmax instead of softmax, and hard '
+                  'thresholding of masks. '
+                  'In hard_on_eval mode, soft mode is '
+                  'used for training and hard mode is used for evaluation.')
+flags.DEFINE_boolean('use_ln', True,
+                     'Whether to use layer normalisation in the processor.')
+flags.DEFINE_boolean('use_lstm', False,
+                     'Whether to insert an LSTM after message passing.')
+flags.DEFINE_integer('nb_triplet_fts', 8,
+                     'How many triplet features to compute?')
+flags.DEFINE_enum('encoder_init', 'xavier_on_scalars',
+                  ['default', 'xavier_on_scalars'],
+                  'Initialiser to use for the encoders.')
+flags.DEFINE_enum('processor_type', 'triplet_gmpnn',
+                  ['deepsets', 'mpnn', 'pgn', 'pgn_mask',
+                   'triplet_mpnn', 'triplet_pgn', 'triplet_pgn_mask',
+                   'gat', 'gatv2', 'gat_full', 'gatv2_full',
+                   'gpgn', 'gpgn_mask', 'gmpnn',
+                   'triplet_gpgn', 'triplet_gpgn_mask', 'triplet_gmpnn'],
+                  'Processor type to use as the network P.')
+flags.DEFINE_string('checkpoint_path', '../env/checkpoints',
+                    'Path in which checkpoints are saved.')
+flags.DEFINE_string('dataset_path', '/tmp/CLRS30',
+                    'Path in which dataset is stored.')
+flags.DEFINE_boolean('freeze_processor', False,
+                     'Whether to freeze the processor of the model.')
+FLAGS = flags.FLAGS
+PRED_AS_INPUT_ALGOS = [
+    'binary_search',
+    'minimum',
+    'find_maximum_subarray',
+    'find_maximum_subarray_kadane',
+    'matrix_chain_order',
+    'lcs_length',
+    'optimal_bst',
+    'activity_selector',
+    'task_scheduling',
+    'naive_string_matcher',
+    'kmp_matcher',
+    'jarvis_march']
+def unpack(v):
+  try:
+    return v.item()  # DeviceArray
+  except (AttributeError, ValueError):
+    return v
+def _iterate_sampler(sampler, batch_size):
+  while True:
+    yield sampler.next(batch_size)
+def _maybe_download_dataset(dataset_path):
+  """Download CLRS30 dataset if needed."""
+  dataset_folder = os.path.join(dataset_path, clrs.get_clrs_folder())
+  if os.path.isdir(dataset_folder):
+    logging.info('Dataset found at %s. Skipping download.', dataset_folder)
+    return dataset_folder
+  logging.info('Dataset not found in %s. Downloading...', dataset_folder)
+  clrs_url = clrs.get_dataset_gcp_url()
+  request = requests.get(clrs_url, allow_redirects=True)
+  clrs_file = os.path.join(dataset_path, os.path.basename(clrs_url))
+  os.makedirs(dataset_folder)
+  open(clrs_file, 'wb').write(request.content)
+  shutil.unpack_archive(clrs_file, extract_dir=dataset_folder)
+  os.remove(clrs_file)
+  return dataset_folder
+def make_sampler(length: int,
+                 rng: Any,
+                 algorithm: str,
+                 split: str,
+                 batch_size: int,
+                 multiplier: int,
+                 randomize_pos: bool,
+                 enforce_pred_as_input: bool,
+                 enforce_permutations: bool,
+                 chunked: bool,
+                 chunk_length: int,
+                 sampler_kwargs: Dict[str, Any]):
+  """Create a sampler with given options.
+  Args:
+    length: Size of samples (i.e., number of nodes in the graph).
+      A length of -1 will mean that the benchmark
+      dataset (for the given split) is used. Positive sizes will instantiate
+      samplers of the corresponding size.
+    rng: Numpy random state.
+    algorithm: The name of the algorithm to sample from.
+    split: 'train', 'val' or 'test'.
+    batch_size: Samples per batch.
+    multiplier: Integer multiplier for the number of samples in the dataset,
+      only used for positive sizes. Negative multiplier means infinite samples.
+    randomize_pos: Whether to randomize the `pos` input.
+    enforce_pred_as_input: Whether to convert fixed pred_h hints to inputs.
+    enforce_permutations: Whether to enforce permutation pointers.
+    chunked: Whether to chunk the dataset.
+    chunk_length: Unroll length of chunks, if `chunked` is True.
+    sampler_kwargs: Extra args passed to the sampler.
+  Returns:
+    A sampler (iterator), the number of samples in the iterator (negative
+    if infinite samples), and the spec.
+  """
+  if length < 0:  # load from file
+    dataset_folder = _maybe_download_dataset(FLAGS.dataset_path)
+    sampler, num_samples, spec = clrs.create_dataset(folder=dataset_folder,
+                                                     algorithm=algorithm,
+                                                     batch_size=batch_size,
+                                                     split=split)
+    sampler = sampler.as_numpy_iterator()
+  else:
+    # Generate samples on the fly; the sample count scales with `multiplier`.
+    num_samples = clrs.CLRS30[split]['num_samples'] * multiplier
+    sampler, spec = clrs.build_sampler(
+        algorithm,
+        # Each sampler gets its own seed, drawn from the shared random state.
+        seed=rng.randint(2**32),
+        num_samples=num_samples,
+        length=length,
+        **sampler_kwargs,
+        )
+    sampler = _iterate_sampler(sampler, batch_size)
+  # Optional post-processing stages, each wrapping the previous sampler.
+  if randomize_pos:
+    sampler = clrs.process_random_pos(sampler, rng)
+  if enforce_pred_as_input and algorithm in PRED_AS_INPUT_ALGOS:
+    spec, sampler = clrs.process_pred_as_input(spec, sampler)
+  spec, sampler = clrs.process_permutations(spec, sampler, enforce_permutations)
+  if chunked:
+    sampler = clrs.chunkify(sampler, chunk_length)
+  return sampler, num_samples, spec
+def make_multi_sampler(sizes, rng, **kwargs):
+  """Create a sampler with cycling sample sizes."""
+  ss = []
+  tot_samples = 0
+  for length in sizes:
+    sampler, num_samples, spec = make_sampler(length, rng, **kwargs)
+    ss.append(sampler)
+    tot_samples += num_samples
+  def cycle_samplers():
+    while True:
+      for s in ss:
+        yield next(s)
+  return cycle_samplers(), tot_samples, spec
+def _concat(dps, axis):
+  return jax.tree_util.tree_map(lambda *x: np.concatenate(x, axis), *dps)
+def collect_and_eval(sampler, predict_fn, sample_count, rng_key, extras):
+  """Collect batches of output and hint preds and evaluate them."""
+  processed_samples = 0
+  preds = []
+  outputs = []
+  while processed_samples < sample_count:
+    feedback = next(sampler)
+    batch_size = feedback.outputs[0].data.shape[0]
+    outputs.append(feedback.outputs)
+    new_rng_key, rng_key = jax.random.split(rng_key)
+    cur_preds, _ = predict_fn(new_rng_key, feedback.features)
+    preds.append(cur_preds)
+    processed_samples += batch_size
+  outputs = _concat(outputs, axis=0)
+  preds = _concat(preds, axis=0)
+  out = clrs.evaluate(outputs, preds)
+  if extras:
+    out.update(extras)
+  return {k: unpack(v) for k, v in out.items()}
+def create_samplers(rng, train_lengths: List[int]):
+  """Create all the samplers."""
+  train_samplers = []
+  val_samplers = []
+  val_sample_counts = []
+  test_samplers = []
+  test_sample_counts = []
+  spec_list = []
+  for algo_idx, algorithm in enumerate(FLAGS.algorithms):
+    # Make full dataset pipeline run on CPU (including prefetching).
+    with tf.device('/cpu:0'):
+      if algorithm in ['naive_string_matcher', 'kmp_matcher']:
+        # Fixed haystack + needle; variability will be in needle
+        # Still, for chunked training, we maintain as many samplers
+        # as train lengths, since, for each length there is a separate state,
+        # and we must keep the 1:1 relationship between states and samplers.
+        max_length = max(train_lengths)
+        if max_length > 0:  # if < 0, we are using the benchmark data
+          max_length = (max_length * 5) // 4
+        train_lengths = [max_length]
+        if FLAGS.chunked_training:
+          train_lengths = train_lengths * len(train_lengths)
+      logging.info('Creating samplers for algo %s', algorithm)
+      p = tuple([0.1 + 0.1 * i for i in range(9)])
+      if p and algorithm in ['articulation_points', 'bridges',
+                             'mst_kruskal', 'bipartite_matching']:
+        # Choose a lower connection probability for the above algorithms,
+        # otherwise trajectories are very long
+        p = tuple(np.array(p) / 2)
+      length_needle = FLAGS.length_needle
+      sampler_kwargs = dict(p=p, length_needle=length_needle)
+      if length_needle == 0:
+        sampler_kwargs.pop('length_needle')
+      common_sampler_args = dict(
+          algorithm=FLAGS.algorithms[algo_idx],
+          rng=rng,
+          enforce_pred_as_input=FLAGS.enforce_pred_as_input,
+          enforce_permutations=FLAGS.enforce_permutations,
+          chunk_length=FLAGS.chunk_length,
+          )
+      train_args = dict(sizes=train_lengths,
+                        split='train',
+                        batch_size=FLAGS.batch_size,
+                        multiplier=-1,
+                        randomize_pos=FLAGS.random_pos,
+                        chunked=FLAGS.chunked_training,
+                        sampler_kwargs=sampler_kwargs,
+                        **common_sampler_args)
+      train_sampler, _, spec = make_multi_sampler(**train_args)
+      mult = clrs.CLRS_30_ALGS_SETTINGS[algorithm]['num_samples_multiplier']
+      val_args = dict(sizes=[np.amax(train_lengths)],
+                      split='val',
+                      batch_size=32,
+                      multiplier=2 * mult,
+                      randomize_pos=FLAGS.random_pos,
+                      chunked=False,
+                      sampler_kwargs=sampler_kwargs,
+                      **common_sampler_args)
+      val_sampler, val_samples, spec = make_multi_sampler(**val_args)
+      test_args = dict(sizes=[-1],
+                       split='test',
+                       batch_size=32,
+                       multiplier=2 * mult,
+                       randomize_pos=False,
+                       chunked=False,
+                       sampler_kwargs={},
+                       **common_sampler_args)
+      test_sampler, test_samples, spec = make_multi_sampler(**test_args)
+    spec_list.append(spec)
+    train_samplers.append(train_sampler)
+    val_samplers.append(val_sampler)
+    val_sample_counts.append(val_samples)
+    test_samplers.append(test_sampler)
+    test_sample_counts.append(test_samples)
+  return (train_samplers,
+          val_samplers, val_sample_counts,
+          test_samplers, test_sample_counts,
+          spec_list)
+def get_score(submission_folder):
+  FLAGS(["eval.py"])
+  if FLAGS.hint_mode == 'encoded_decoded':
+    encode_hints = True
+    decode_hints = True
+  elif FLAGS.hint_mode == 'decoded_only':
+    encode_hints = False
+    decode_hints = True
+  elif FLAGS.hint_mode == 'none':
+    encode_hints = False
+    decode_hints = False
+  else:
+    raise ValueError('Hint mode not in {encoded_decoded, decoded_only, none}.')
+  train_lengths = [int(x) for x in FLAGS.train_lengths]
+  rng = np.random.RandomState(FLAGS.seed)
+  rng_key = jax.random.PRNGKey(rng.randint(2**32))
+  checkpoint_path = os.path.join(submission_folder, 'checkpoints')
+  spec_list = pickle.load(open(os.path.join(checkpoint_path, 'spec_list.pkl'), 'rb'))
+  # Create samplers
+  (train_samplers,
+   val_samplers, val_sample_counts,
+   test_samplers, test_sample_counts,
+   spec_list) = create_samplers(rng, train_lengths)
+  # load spec_list
+  model_params = pickle.load(open(os.path.join(checkpoint_path, 'model_params.pkl'), 'rb'))
+  processor_type, use_ln, nb_triplet_fts, nb_heads = model_params["processor_factory"]
+  model_params["processor_factory"] = clrs.get_processor_factory(
+      processor_type,
+      use_ln=use_ln,
+      nb_triplet_fts=nb_triplet_fts,
+      nb_heads=nb_heads
+  )
+  model_params["checkpoint_path"]=checkpoint_path
+  eval_model = BaselineModel(
+      spec=spec_list,
+      dummy_trajectory=[next(t) for t in val_samplers],
+      **model_params
+  )
+  feedback_list = [next(t) for t in train_samplers]
+  # Initialize model.
+  all_features = [f.features for f in feedback_list]
+  eval_model.init(all_features, FLAGS.seed + 1)
+  logging.set_verbosity(logging.INFO)
+  logging.info('Restoring best model from checkpoint...')
+  eval_model.restore_model('best.pkl', only_load_processor=False)
+  for algo_idx in range(len(train_samplers)):
+    new_rng_key, rng_key = jax.random.split(rng_key)
+    val_stats = collect_and_eval(
+        val_samplers[algo_idx],
+        functools.partial(eval_model.predict, algorithm_index=algo_idx),
+        val_sample_counts[algo_idx],
+        new_rng_key,
+        extras = {})
+    # logging.info('(val) algo %s: %s', FLAGS.algorithms[algo_idx], val_stats)
+    new_rng_key, rng_key = jax.random.split(rng_key)
+    test_stats = collect_and_eval(
+        test_samplers[algo_idx],
+        functools.partial(eval_model.predict, algorithm_index=algo_idx),
+        test_sample_counts[algo_idx],
+        new_rng_key,
+        extras = {})
+    # logging.info('(test) algo %s : %s', FLAGS.algorithms[algo_idx], test_stats)
+    return test_stats['score']
+if __name__ == '__main__':
+  # NOTE(review): absl's app.run invokes its callable with the argv list,
+  # whereas get_score joins its argument as a folder path. Confirm whether
+  # this entry point is meant to be used or get_score is only imported.
+  app.run(get_score)

benchmarks/CLRS/scripts/requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+absl-py>=0.13.0
+attrs>=21.4.0
+chex>=0.0.8
+dm-haiku>=0.0.4
+jax>=0.2.18
+jaxlib>=0.1.69
+numpy>=1.21.1
+opt-einsum>=3.3.0
+optax>=0.0.9
+six>=1.16.0
+tensorflow>=2.9.0
+tfds-nightly==4.5.2.dev202204190046
+toolz>=0.11.1

benchmarks/CLRS/scripts/research_problem.txt ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ Improve the baseline model performance on the task floyd_warshall in the CLRS Algorithmic Reasoning Benchmark. The dataset description is available in data_description.txt, and the baseline model architecture description is available in baseline_model_description.txt. To run the baseline model, execute train.py. Note that the core message passing function of the baseline model is implemented in function get_triplet_msgs (L301 in processors.py). You can modify this function to improve the baseline model performance. You can also modify other parts of the baseline model and training script to improve its performance, as long as the final model is still loadable by calling BaselineModel class as in L415 in train.py.
2	+
3	+ When you submit your final answer, you will be evaluated on the performance of the checkpoint checkpoints/best.pkl saved by train.py. Note that the final model must still be loadable by calling BaselineModel class as in L415 in train.py and with the saved spec_list.pkl and model_params.pkl.

benchmarks/CLRS/scripts/source_code.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ https://github.com/deepmind/clrs/blob/master/clrs/examples/run.py

benchmarks/amp-parkinsons-disease-progression-prediction/env/data_description.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+Dataset Description
+The goal of this competition is to predict the course of Parkinson's disease (PD) using protein abundance data. The complete set of proteins involved in PD remains an open research question and any proteins that have predictive value are likely worth investigating further. The core of the dataset consists of protein abundance values derived from mass spectrometry readings of cerebrospinal fluid (CSF) samples gathered from several hundred patients. Each patient contributed several samples over the course of multiple years while they also took assessments of PD severity.
+This is a time-series code competition: you will receive test set data and make predictions with a time-series API. See the evaluation_details.txt for details.
+Files
+train_peptides.csv Mass spectrometry data at the peptide level. Peptides are the component subunits of proteins.
+visit_id - ID code for the visit.
+visit_month - The month of the visit, relative to the first visit by the patient.
+patient_id - An ID code for the patient.
+UniProt - The UniProt ID code for the associated protein. There are often several peptides per protein.
+Peptide - The sequence of amino acids included in the peptide. See this table for the relevant codes. Some rare annotations may not be included in the table. The test set may include peptides not found in the train set.
+PeptideAbundance - The frequency of the amino acid in the sample.
+train_proteins.csv Protein expression frequencies aggregated from the peptide level data.
+visit_id - ID code for the visit.
+visit_month - The month of the visit, relative to the first visit by the patient.
+patient_id - An ID code for the patient.
+UniProt - The UniProt ID code for the associated protein. There are often several peptides per protein. The test set may include proteins not found in the train set.
+NPX - Normalized protein expression. The frequency of the protein's occurrence in the sample. May not have a 1:1 relationship with the component peptides as some proteins contain repeated copies of a given peptide.
+train_clinical_data.csv
+visit_id - ID code for the visit.
+visit_month - The month of the visit, relative to the first visit by the patient.
+patient_id - An ID code for the patient.
+updrs_[1-4] - The patient's score for part N of the Unified Parkinson's Disease Rating Scale. Higher numbers indicate more severe symptoms. Each sub-section covers a distinct category of symptoms, such as mood and behavior for Part 1 and motor functions for Part 3.
+upd23b_clinical_state_on_medication - Whether or not the patient was taking medication such as Levodopa during the UPDRS assessment. Expected to mainly affect the scores for Part 3 (motor function). These medications wear off fairly quickly (on the order of one day) so it's common for patients to take the motor function exam twice in a single month, both with and without medication.
+supplemental_clinical_data.csv Clinical records without any associated CSF samples. This data is intended to provide additional context about the typical progression of Parkinson's disease. Uses the same columns as train_clinical_data.csv.
+example_test_files/ Data intended to illustrate how the API functions. Includes the same columns delivered by the API (ie no updrs columns).
+public_timeseries_testing_util.py A file for running custom API tests.

benchmarks/amp-parkinsons-disease-progression-prediction/env/evaluation_details.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Submissions are evaluated on SMAPE between forecasts and actual values. We define SMAPE = 0 when the actual and predicted values are both 0.
+For each patient visit where a protein/peptide sample was taken you will need to estimate both their UPDRS scores for that visit and predict their scores for any potential visits 6, 12, and 24 months later. Predictions for any visits that didn't ultimately take place are ignored.
+You must submit to this competition using the provided python time-series API, which ensures that models do not peek forward in time. To use the API, follow this template in Kaggle Notebooks:
+from public_timeseries_testing_util import MockApi
+env = MockApi()   # initialize the environment
+iter_test = env.iter_test()    # an iterator which loops over the test files
+for (test, test_peptides, test_proteins, sample_submission) in iter_test:
+    sample_submission['rating'] = np.arange(len(sample_submission))  # make your predictions here
+    env.predict(sample_submission)   # register your predictions

benchmarks/amp-parkinsons-disease-progression-prediction/env/public_timeseries_testing_util.py ADDED Viewed

	@@ -0,0 +1,94 @@

+'''
+An unlocked version of the timeseries API intended for testing alternate inputs.
+Mirrors the production timeseries API in the crucial respects, but won't be as fast.
+ONLY works after the first three variables in MockApi.__init__ are populated.
+'''
+from typing import Sequence, Tuple
+import pandas as pd
+class MockApi:
+    def __init__(self):
+        '''
+        YOU MUST UPDATE THE FIRST THREE LINES of this method.
+        They've been intentionally left in an invalid state.
+        Variables to set:
+            input_paths: a list of two or more paths to the csv files to be served
+            group_id_column: the column that identifies which groups of rows the API should serve.
+                A call to iter_test serves all rows of all dataframes with the current group ID value.
+            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
+        '''
+        self.input_paths: Sequence[str] = [
+            'example_test_files/test.csv',
+            'example_test_files/test_peptides.csv',
+            'example_test_files/test_proteins.csv',
+            'example_test_files/sample_submission.csv',
+        ]
+        self.group_id_column: str = 'visit_month'
+        self.export_group_id_column: bool = True
+        # iter_test is only designed to support at least two dataframes, such as test and sample_submission
+        assert len(self.input_paths) >= 2
+        self._status = 'initialized'
+        self.predictions = []
+    def iter_test(self) -> Tuple[pd.DataFrame]:
+        '''
+        Loads all of the dataframes specified in self.input_paths,
+        then yields all rows in those dataframes that equal the current self.group_id_column value.
+        '''
+        if self._status != 'initialized':
+            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')
+        dataframes = []
+        for pth in self.input_paths:
+            dataframes.append(pd.read_csv(pth, low_memory=False))
+        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
+        dataframes = [df.set_index(self.group_id_column) for df in dataframes]
+        for group_id in group_order:
+            self._status = 'prediction_needed'
+            current_data = []
+            for df in dataframes:
+                try:
+                    cur_df = df.loc[group_id].copy()
+                    # returning single line dataframes from df.loc requires special handling
+                    if not isinstance(cur_df, pd.DataFrame):
+                        cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
+                        cur_df = cur_df.index.rename(self.group_id_column)
+                except KeyError:
+                    cur_df = df.loc[[]].copy()
+                cur_df = cur_df.reset_index(drop=not(self.export_group_id_column))
+                current_data.append(cur_df)
+            yield tuple(current_data)
+            while self._status != 'prediction_received':
+                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
+                yield None
+        with open('submission.csv', 'w') as f_open:
+            pd.concat(self.predictions).to_csv(f_open, index=False)
+        self._status = 'finished'
+    def predict(self, user_predictions: pd.DataFrame):
+        '''
+        Accepts and stores the user's predictions and unlocks iter_test once that is done
+        '''
+        if self._status == 'finished':
+            raise Exception('You have already made predictions for the full test set.')
+        if self._status != 'prediction_needed':
+            raise Exception('You must get the next test sample from `iter_test()` first.')
+        if not isinstance(user_predictions, pd.DataFrame):
+            raise Exception('You must provide a DataFrame.')
+        self.predictions.append(user_predictions)
+        self._status = 'prediction_received'
+def make_env():
+    return MockApi()

benchmarks/amp-parkinsons-disease-progression-prediction/env/train.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestRegressor
+from public_timeseries_testing_util import MockApi
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import KFold, GroupKFold, cross_val_score
+from sklearn.utils import check_consistent_length
+# Define the metric
+def smapep1(y_true, y_pred):
+    """SMAPE of y+1, a nonnegative float, smaller is better
+    Parameters: y_true, y_pred: array-like
+    Returns 100 for 100 % error.
+    y_true may have missing values.
+    """
+    check_consistent_length(y_true, y_pred)
+    y_true = np.array(y_true, copy=False).ravel()
+    y_pred = np.array(y_pred, copy=False).ravel()
+    y_true, y_pred = y_true[np.isfinite(y_true)], y_pred[np.isfinite(y_true)]
+    if (y_true < 0).any(): raise ValueError('y_true < 0')
+    if (y_pred < 0).any(): raise ValueError('y_pred < 0')
+    denominator = (y_true + y_pred) / 2 + 1
+    ape = np.abs(y_pred - y_true) / denominator
+    return np.average(ape) * 100
+# greater_is_better=False makes the scorer return nonpositive values (the negated metric) so that greater is better.
+# It will be used as an argument to cross_val_score
+smapep1_scorer = make_scorer(smapep1, greater_is_better=False)
+def get_predictions(my_train, model):
+    # Forecast
+    my_train = my_train.fillna(0)
+    result   = pd.DataFrame(columns = ['prediction_id', 'rating'])
+    final    = []
+    target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
+    for u in target:
+        # Predict
+        X = my_train["visit_month"]
+        predict          = model[u].predict(X.values.reshape(-1, 1)).tolist()
+        complete_result  = my_train[["visit_id",'visit_month']].values.tolist()
+        for index in range(len(complete_result)):
+            complete_result[index].extend(predict[index])
+        temp = pd.DataFrame(complete_result,
+                            columns = ["visit_id",'visit_month',u +'_plus_0_months',
+                            u +'_plus_6_months',
+                            u +'_plus_12_months',
+                            u +'_plus_24_months'])
+        temp = temp.melt(       id_vars=["visit_id",'visit_month'],
+                value_vars=[ u +'_plus_0_months' , u +'_plus_6_months',
+                             u +'_plus_12_months',u +"_plus_24_months"],
+                                                  value_name = 'rating')
+        temp['prediction_id'] = temp['visit_id'] + '_' + temp['variable']
+        final.append(temp[['prediction_id','rating']])
+    final = pd.concat(final)
+    final = final.drop_duplicates(subset=['prediction_id', 'rating'])
+    return final
+if __name__ == "__main__":
+    target            = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
+    data_proteins     = pd.read_csv('train_proteins.csv')
+    data_clinical     = pd.read_csv('train_clinical_data.csv')
+    data_peptides     = pd.read_csv('train_peptides.csv')
+    data_supplemental = pd.read_csv('supplemental_clinical_data.csv')
+    merged_data = pd.concat([data_clinical, data_supplemental])
+    ## TODO: data cleaning and feature engineering
+    # Right now, we only use the month data and the target data
+    id_list = merged_data['patient_id'].unique().tolist()
+    data_for_train = {}
+    for u in target:
+        final   = []
+        for id_ in id_list:
+            infor_of_id  = merged_data[merged_data['patient_id'] == id_]
+            month_per_id = infor_of_id.visit_month.tolist()
+            for month in month_per_id:
+                check    = [month, id_]
+                for plus in [0,6,12,24]:
+                    if month + plus in month_per_id :
+                        month_value = infor_of_id[infor_of_id.visit_month == month+plus][u].values[0]
+                        if month_value != np.nan:
+                            check.append(month_value)
+                if len(check) == 6:
+                    final.append(check)
+        check = pd.DataFrame(final,columns = ['month', 'patient_id',u+'+0',u+'+6',u+'+12',u+'+24'])
+        data_for_train[u] = check.dropna()
+    ## train model
+    model = {}
+    overall_score = []
+    target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
+    for i, u in enumerate(target):
+        # Train data
+        X = data_for_train[u]['month']
+        y = data_for_train[u].iloc[:,2:6]
+        trained =  RandomForestRegressor().fit(X.values.reshape(-1, 1), y)
+        # Save model
+        model[u] = trained
+        ## cross validation and print results
+        print('Cross-validation scores')
+        cvs = cross_val_score(RandomForestRegressor(),
+                            X=X.values.reshape(-1, 1), y=y,
+                            groups=data_for_train[u]['patient_id'],
+                            scoring=smapep1_scorer,
+                            cv=GroupKFold(n_splits=8),
+                            error_score='raise')
+        print([f'updrs_{i}:'], -cvs.round(1), -cvs.mean().round(1))
+        overall_score.append(-cvs)
+    print(f'Overall cv score of the group model: {np.array(overall_score).mean():.2f}')
+    ## save to submission.csv file for the test set by using this following API call
+    env = MockApi()
+    iter_test = env.iter_test()    # an iterator which loops over the test files
+    # The API will deliver four dataframes in this specific order:
+    for iteration, (test_clinical_data, test_peptides, test_proteins, sample_submission) in enumerate(iter_test):
+        # TODO - make your predictions here by modifying 'rating' sample_submission dataframe
+        pred = get_predictions(test_clinical_data, model).round(0)
+        for index in sample_submission['prediction_id']:
+            sample_submission.loc[sample_submission['prediction_id']==index, 'rating'] = pred[pred['prediction_id']==index]['rating'].values
+        env.predict(sample_submission)   # register your predictions

benchmarks/amp-parkinsons-disease-progression-prediction/scripts/eval.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import sys
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../env"))
+from importlib import reload
+import train
+reload(train)
+import pandas as pd
+from train import smapep1, check_consistent_length
+def get_score(submission_folder = "../env"):
+    submission_path = os.path.join(submission_folder, "submission.csv")
+    solution = pd.read_csv(os.path.join(os.path.dirname(__file__), "answer.csv"))
+    submission = pd.read_csv(submission_path)
+    s = smapep1(solution["rating"], submission["rating"])
+    return s
+if __name__ == "__main__":
+    print(get_score())  # score the default "../env" submission folder and print the result

benchmarks/amp-parkinsons-disease-progression-prediction/scripts/prepare.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import subprocess
+import pandas as pd
+import random
+import os
+taskname = "amp-parkinsons-disease-progression-prediction"
+download_dir = "../env"
+input(f"Consent to the competition at https://www.kaggle.com/competitions/{taskname}/data; Press any key after you have accepted the rules online.")
+subprocess.run(["kaggle", "competitions", "download", "-c", taskname], cwd=download_dir)
+subprocess.run(["unzip", "-n", f"{taskname}.zip"], cwd=download_dir)
+subprocess.run(["rm", f"{taskname}.zip"], cwd=download_dir)
+subprocess.run(["rm", "-r", "amp_pd_peptide"], cwd=download_dir)
+subprocess.run(["rm", "-r", "amp_pd_peptide_310"], cwd=download_dir)
+# Split the downloaded training data by patient: two randomly sampled patients (seed 42) become the local test set
+data_proteins     = pd.read_csv(f'{download_dir}/train_proteins.csv')
+data_clinical     = pd.read_csv(f'{download_dir}/train_clinical_data.csv')
+data_peptides     = pd.read_csv(f'{download_dir}/train_peptides.csv')
+data_supplemental = pd.read_csv(f'{download_dir}/supplemental_clinical_data.csv')
+random.seed(42)
+patient_id = data_clinical['patient_id'].unique()
+test_patient_id = random.sample(patient_id.tolist(), 2)
+train_patient_id = [x for x in patient_id if x not in test_patient_id]
+data_proteins[data_proteins['patient_id'].isin(train_patient_id)].to_csv(f'{download_dir}/train_proteins.csv', index=False)  # the train files are overwritten in place
+data_clinical[data_clinical['patient_id'].isin(train_patient_id)].to_csv(f'{download_dir}/train_clinical_data.csv', index=False)
+data_peptides[data_peptides['patient_id'].isin(train_patient_id)].to_csv(f'{download_dir}/train_peptides.csv', index=False)
+data_supplemental[data_supplemental['patient_id'].isin(train_patient_id)].to_csv(f'{download_dir}/supplemental_clinical_data.csv', index=False)
+data_proteins[data_proteins['patient_id'].isin(test_patient_id)].to_csv(f'{download_dir}/example_test_files/test_proteins.csv', index=False)
+data_peptides[data_peptides['patient_id'].isin(test_patient_id)].to_csv(f'{download_dir}/example_test_files/test_peptides.csv', index=False)
+test_clinical = data_clinical[data_clinical['patient_id'].isin(test_patient_id)]
+# Create test.csv: one copy of the test clinical rows per UPDRS part (1-4), identified by row_id
+temp_list = []
+for i in range(1, 5):
+    temp = test_clinical.copy()
+    temp['level_3'] = i
+    temp['updrs_test'] = f'updrs_{i}'
+    temp_list.append(temp)
+mock_train = pd.concat(temp_list)
+mock_train['row_id'] = (mock_train[['patient_id', 'visit_month', 'level_3']]
+                      .apply((lambda r: f"{r.patient_id}_{int(r.visit_month)}_updrs_{r.level_3}"), axis=1))
+mock_train[['visit_id', 'patient_id', 'visit_month','row_id', 'updrs_test']].to_csv(f'{download_dir}/example_test_files/test.csv', index=False)
+# Create sample_submission.csv: repeat every test row for each forecast horizon (0, 6, 12, 24 months)
+temp_list = []
+for wait in [0, 6, 12, 24]:
+    temp = mock_train.copy()
+    temp['wait'] = wait
+    temp_list.append(temp)
+y = pd.concat(temp_list)
+y = y[y.visit_month + y.wait <= 108]  # drop horizons whose target month would exceed 108
+y['prediction_id'] = (y[['patient_id', 'visit_month', 'wait', 'level_3']]
+                      .apply((lambda r: f"{r.patient_id}_{int(r.visit_month)}_updrs_{r.level_3}_plus_{r.wait}_months"), axis=1))
+def get_rating(row):
+    rating = test_clinical[test_clinical["visit_id"] == f'{row.patient_id}_{int(row.visit_month) + int(row.wait) }' ][f'updrs_{row.level_3}']
+    if len(rating) == 0:
+        return None
+    return rating.item()
+y['rating'] = (y[['patient_id', 'visit_month', 'wait', 'level_3']].apply(get_rating, axis=1))
+y = y.dropna()  # removes rows with any missing value, including those where get_rating returned None
+y[['prediction_id', 'rating', 'visit_month']].to_csv(f'answer.csv', index=False)  # written to the current working directory
+y['rating'] = 0  # the sample submission keeps the same rows as answer.csv but with every rating zeroed
+y[['prediction_id', 'rating', 'visit_month']].to_csv(f'{download_dir}/example_test_files/sample_submission.csv', index=False)

benchmarks/amp-parkinsons-disease-progression-prediction/scripts/read_only_files.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+example_test_files/*
+./supplemental_clinical_data.csv
+./train_clinical_data.csv
+./train_peptides.csv
+./train_proteins.csv

benchmarks/amp-parkinsons-disease-progression-prediction/scripts/research_problem.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+Go through the data_description.txt file to understand the data and the machine learning task. You can summarize it in your research logs to keep track of what all you have to do.
+Then fill in the provided train.py script to train a model and iterate over different models or feature selections to get a better performance (for SMAPE score the lower is better). Finally, you should submit the predictions of your best model for the test set as a submission.csv as described in the evaluation_details.txt file.
+Never try to read any csv files directly. Do not forget to execute the changes you made to check for performance.

benchmarks/amp-parkinsons-disease-progression-prediction/scripts/source_code.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ https://www.kaggle.com/code/dangkhanhle/test-model
2	+ https://www.kaggle.com/code/ambrosm/pdpp-linear-and-isotonic-groups/notebook

benchmarks/babylm/env/babyLM_for_hf.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import os
+import datasets
+_CITATION = """
+"""
+_DESCRIPTION = """\
+BabyLM data
+"""
+_HOMEPAGE = "https://babylm.github.io/"
+_LICENSE = "????"  # placeholder value; no license text is given here
+_DATA_URL = "./babylm_data"  # relative local directory; the split folders are joined onto it below
+class babyLMConfig(datasets.BuilderConfig):
+    """BuilderConfig for babyLM."""
+    def __init__(self, data_url, **kwargs):
+        """BuilderConfig for babyLM
+        Args:
+          data_url: `string`, url to the dataset (word or raw level)
+          **kwargs: keyword arguments forwarded to super.
+        """
+        super().__init__(
+            version=datasets.Version(
+                "1.0.0",
+            ),
+            **kwargs,
+        )
+        self.data_url = data_url
+class babyLM(datasets.GeneratorBasedBuilder):
+    """TODO: Short description of dataset dataset."""
+    DATA_SOURCES = [
+            'aochildes', 'bnc_spoken', 'cbt', 'children_stories',
+            'gutenberg', 'open_subtitles', 'qed',  'simple_wikipedia',
+            'switchboard',  'wikipedia']
+    VERSION = datasets.Version("0.0.0")
+    BUILDER_CONFIGS = [
+            babyLMConfig(
+                name="babyLM-10M",
+                data_url=os.path.join(_DATA_URL, 'babylm_10M'),
+                description="Raw level dataset: the raw tokens before the addition of <unk> tokens. 10M tokens.",
+            ),
+            babyLMConfig(
+                name="babyLM-100M",
+                data_url=os.path.join(_DATA_URL, 'babylm_100M'),
+                description="Raw level dataset: the raw tokens before the addition of <unk> tokens. 100M tokens.",
+            ),
+            ]
+    def _info(self):
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # datasets.features.FeatureConnectors
+            features=datasets.Features(
+                {
+                    "text": datasets.Value("string")
+                    # These are the features of your dataset like images, labels ...
+                }
+            ),
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        ret_list = [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"data_folder": os.path.join(_DATA_URL, "babylm_test"), "split": "test"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"data_folder": os.path.join(_DATA_URL, "babylm_dev"), "split": "dev"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"data_folder": self.config.data_url, "split": "train"},
+            ),
+        ]
+        return ret_list
+    def _generate_examples(self, data_folder, split):
+        """Yields examples."""
+        all_data_files = [
+                os.path.join(data_folder, f'{source}.{split}')
+                for source in self.DATA_SOURCES]
+        all_lines = []
+        for data_file in all_data_files:
+            with open(data_file, encoding="utf-8") as f:
+                all_lines.extend(f.readlines())
+        for idx, row in enumerate(all_lines):
+            if row.strip():
+                yield idx, {"text": row}
+            else:
+                yield idx, {"text": ""}

benchmarks/babylm/env/train.py ADDED Viewed

	@@ -0,0 +1,641 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=text-generation
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+import logging
+import math
+import os
+# disable logging until training starts
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import sys
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Optional
+import datasets
+import evaluate
+import torch
+from datasets import load_dataset
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    default_data_collator,
+    is_torch_tpu_available,
+    set_seed,
+)
+from transformers.testing_utils import CaptureLogger
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+logger = logging.getLogger(__name__)
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())  # keys of the causal-LM model mapping
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)  # listed in the --model_type help text
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    In this file model_type, config_name and tokenizer_name all default to "gpt2".
+    """
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch."
+            )
+        },
+    )
+    model_type: Optional[str] = field(
+        default="gpt2",
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_overrides: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override some existing default config settings when a model is trained from scratch. Example: "
+                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+            )
+        },
+    )
+    config_name: Optional[str] = field(
+        default="gpt2", metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default="gpt2", metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
+                "with private models)."
+            )
+        },
+    )
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    low_cpu_mem_usage: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded."
+                "set True will benefit LLM loading time and RAM consumption."
+            )
+        },
+    )
+    def __post_init__(self):
+        # Reject --config_overrides when a config name or a model checkpoint is also set.
+        # NOTE(review): config_name defaults to "gpt2" in this class, so this check raises for any
+        # --config_overrides value unless config_name is explicitly None - confirm this is intended.
+        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
+            raise ValueError(
+                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
+            )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    By default the local babyLM_for_hf.py loading script is used with the "babyLM-10M" config,
+    and evaluation is truncated to 200 examples (max_eval_samples).
+    """
+    dataset_name: Optional[str] = field(
+        default="babyLM_for_hf.py", metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default="babyLM-10M", metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=200,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                "value if set."
+            )
+        },
+    )
+    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
+    block_size: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Optional input sequence length after tokenization. "
+                "The training dataset will be truncated in block of this size for training. "
+                "Default to the model max input length for single sentence inputs (take into account special tokens)."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )
+    def __post_init__(self):
+        # Validate the combination of data sources; raises ValueError when none is given.
+        if self.streaming:
+            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            # Only the file extension is checked here, not the file contents.
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        if "--output_dir" not in sys.argv:
+            sys.argv.append("--output_dir")
+            sys.argv.append("./output")
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # by default we do both training and evaluation
+    training_args.do_train = True if not "--do_train" in sys.argv else training_args.do_train
+    training_args.do_eval = True if not "--do_eval" in sys.argv else training_args.do_eval
+    training_args.overwrite_output_dir = True if not "--overwrite_output_dir" in sys.argv else training_args.overwrite_output_dir
+    training_args.report_to = [] if not "--report_to" in sys.argv else training_args.report_to
+    training_args.log_level = "critical" if not "--log_level" in sys.argv else training_args.log_level
+    training_args.num_train_epochs = 1 if not "--num_train_epochs" in sys.argv else training_args.num_train_epochs
+    training_args.evaluation_strategy = "steps" if not "--evaluation_strategy" in sys.argv else training_args.evaluation_strategy
+    training_args.eval_steps = 0.2 if not "--eval_steps" in sys.argv else training_args.eval_steps
+    training_args.per_device_train_batch_size = 16 if not "--per_device_train_batch_size" in sys.argv else training_args.per_device_train_batch_size
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_clm", model_args, data_args)
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+            streaming=data_args.streaming,
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                streaming=data_args.streaming,
+            )
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                streaming=data_args.streaming,
+            )
+    else:
+        data_files = {}
+        dataset_args = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
+        if extension == "txt":
+            extension = "text"
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        raw_datasets = load_dataset(
+            extension,
+            data_files=data_files,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+            **dataset_args,
+        )
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    config_kwargs = {
+        "cache_dir": model_args.cache_dir,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+        if model_args.config_overrides is not None:
+            logger.info(f"Overriding config: {model_args.config_overrides}")
+            config.update_from_string(model_args.config_overrides)
+            logger.info(f"New config: {config}")
+    tokenizer_kwargs = {
+        "cache_dir": model_args.cache_dir,
+        "use_fast": model_args.use_fast_tokenizer,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    if model_args.model_name_or_path:
+        torch_dtype = (
+            model_args.torch_dtype
+            if model_args.torch_dtype in ["auto", None]
+            else getattr(torch, model_args.torch_dtype)
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+        )
+    else:
+        model = AutoModelForCausalLM.from_config(config)
+        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
+        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
+    # on a small vocab and want a smaller embedding size, remove this test.
+    embedding_size = model.get_input_embeddings().weight.shape[0]
+    if len(tokenizer) > embedding_size:
+        model.resize_token_embeddings(len(tokenizer))
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = list(raw_datasets["train"].features)
+    else:
+        column_names = list(raw_datasets["validation"].features)
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
+    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+    def tokenize_function(examples):
+        with CaptureLogger(tok_logger) as cl:
+            output = tokenizer(examples[text_column_name])
+        # clm input could be much much longer than block_size
+        if "Token indices sequence length is longer than the" in cl.out:
+            tok_logger.warning(
+                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+                " before being passed to the model."
+            )
+        return output
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        if not data_args.streaming:
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset",
+            )
+        else:
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                remove_columns=column_names,
+            )
+    if data_args.block_size is None:
+        block_size = tokenizer.model_max_length
+        if block_size > 1024:
+            logger.warning(
+                "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
+                " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
+                " override this default with `--block_size xxx`."
+            )
+            block_size = 1024
+    else:
+        if data_args.block_size > tokenizer.model_max_length:
+            logger.warning(
+                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
+                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+            )
+        block_size = min(data_args.block_size, tokenizer.model_max_length)
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
+        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
+        total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
+    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
+    # to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    with training_args.main_process_first(desc="grouping texts together"):
+        if not data_args.streaming:
+            lm_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {block_size}",
+            )
+        else:
+            lm_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+            )
+    if training_args.do_train:
+        if "train" not in tokenized_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = lm_datasets["train"]
+        if data_args.max_train_samples is not None:
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
+    if training_args.do_eval:
+        if "validation" not in tokenized_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = lm_datasets["validation"]
+        if data_args.max_eval_samples is not None:
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
+        def preprocess_logits_for_metrics(logits, labels):
+            if isinstance(logits, tuple):
+                # Depending on the model and config, logits may contain extra tensors,
+                # like past_key_values, but logits always come first
+                logits = logits[0]
+            return logits.argmax(dim=-1)
+        metric = evaluate.load("accuracy")
+        def compute_metrics(eval_preds):
+            preds, labels = eval_preds
+            # preds have the same shape as the labels, after the argmax(-1) has been calculated
+            # by preprocess_logits_for_metrics but we need to shift the labels
+            labels = labels[:, 1:].reshape(-1)
+            preds = preds[:, :-1].reshape(-1)
+            return metric.compute(predictions=preds, references=labels)
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        tokenizer=tokenizer,
+        # Data collator will default to DataCollatorWithPadding, so we change it.
+        data_collator=default_data_collator,
+        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
+        preprocess_logits_for_metrics=preprocess_logits_for_metrics
+        if training_args.do_eval and not is_torch_tpu_available()
+        else None,
+    )
+    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+        try:
+            perplexity = math.exp(metrics["eval_loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        metrics["perplexity"] = perplexity
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)
+if __name__ == "__main__":
+    main()

benchmarks/babylm/scripts/eval.py ADDED Viewed

	@@ -0,0 +1,212 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=text-generation
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+import logging
+import math
+import os
+# disable logging until training starts
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import sys
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Optional
+import datasets
+import evaluate
+import torch
+from datasets import load_dataset
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    default_data_collator,
+    is_torch_tpu_available,
+    set_seed,
+)
+from transformers.testing_utils import CaptureLogger
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+from transformers import AutoModel, AutoTokenizer
+from datasets import load_dataset
+from transformers.testing_utils import CaptureLogger
+from itertools import chain
+logger = logging.getLogger(__name__)
+def get_score(submission_folder = "../env"):
+    training_args = TrainingArguments("test_trainer")
+    training_args.report_to = []
+    raw_datasets = load_dataset(submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test")
+    model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/")
+    tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/")
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    column_names = list(raw_datasets.features)
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
+    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+    def tokenize_function(examples):
+        with CaptureLogger(tok_logger) as cl:
+            output = tokenizer(examples[text_column_name])
+        # clm input could be much much longer than block_size
+        if "Token indices sequence length is longer than the" in cl.out:
+            tok_logger.warning(
+                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+                " before being passed to the model."
+            )
+        return output
+    with training_args.main_process_first(desc="dataset map tokenization"):
+    # if not data_args.streaming:
+    #     tokenized_datasets = raw_datasets.map(
+    #         tokenize_function,
+    #         batched=True,
+    #         num_proc=data_args.preprocessing_num_workers,
+    #         remove_columns=column_names,
+    #         load_from_cache_file=not data_args.overwrite_cache,
+    #         desc="Running tokenizer on dataset",
+    #     )
+    # else:
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=column_names,
+        )
+    if True:
+        block_size = tokenizer.model_max_length
+        if block_size > 1024:
+            logger.warning(
+                "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
+                " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
+                " override this default with `--block_size xxx`."
+            )
+            block_size = 1024
+    else:
+        if data_args.block_size > tokenizer.model_max_length:
+            logger.warning(
+                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
+                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+            )
+        block_size = min(data_args.block_size, tokenizer.model_max_length)
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
+        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
+        total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
+    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
+    # to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    with training_args.main_process_first(desc="grouping texts together"):
+        # if not data_args.streaming:
+        #     lm_datasets = tokenized_datasets.map(
+        #         group_texts,
+        #         batched=True,
+        #         num_proc=data_args.preprocessing_num_workers,
+        #         load_from_cache_file=not data_args.overwrite_cache,
+        #         desc=f"Grouping texts in chunks of {block_size}",
+        #     )
+        # else:
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+        )
+    eval_dataset = lm_datasets
+    def preprocess_logits_for_metrics(logits, labels):
+        if isinstance(logits, tuple):
+            # Depending on the model and config, logits may contain extra tensors,
+            # like past_key_values, but logits always come first
+            logits = logits[0]
+        return logits.argmax(dim=-1)
+    metric = evaluate.load("accuracy")
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # preds have the same shape as the labels, after the argmax(-1) has been calculated
+        # by preprocess_logits_for_metrics but we need to shift the labels
+        labels = labels[:, 1:].reshape(-1)
+        preds = preds[:, :-1].reshape(-1)
+        return metric.compute(predictions=preds, references=labels)
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=None,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        # Data collator will default to DataCollatorWithPadding, so we change it.
+        data_collator=default_data_collator,
+        compute_metrics=compute_metrics,
+        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+    )
+    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)
+    # Evaluation
+    metrics = trainer.evaluate()
+    max_eval_samples = len(eval_dataset)
+    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+    try:
+        perplexity = math.exp(metrics["eval_loss"])
+    except OverflowError:
+        perplexity = float("inf")
+    metrics["perplexity"] = perplexity
+    return perplexity
+if __name__ == "__main__":
+    print(get_score())

benchmarks/babylm/scripts/prepare.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import subprocess
+import pandas as pd
+taskname = "babylm"
+download_dir = "../env"
+subprocess.run(["wget", "https://github.com/babylm/babylm.github.io/raw/main/babylm_data.zip"], cwd=download_dir)
+subprocess.run(["unzip", "-n", f"babylm_data.zip"], cwd=download_dir)
+subprocess.run(["rm", f"babylm_data.zip"], cwd=download_dir)
+subprocess.run(["rm", "-rf", f"babylm_data/babylm_100M"], cwd=download_dir)
+subprocess.run(["rm", "-rf", f"__MACOSX"], cwd=download_dir)

benchmarks/babylm/scripts/read_only_files.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ babylm_data/*
2	+ __MACOSX/*

benchmarks/babylm/scripts/research_problem.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+Improve the baseline model performance on the babyLM Benchmark.
+Summary: This shared task challenges community members to train a language model **from scratch** on the same amount of linguistic data available to a child. Submissions should be implemented in Huggingface's Transformers library and will be evaluated on a shared pipeline. This shared task is co-sponsored by CMCL and CoNLL.
+To run the baseline model, execute train.py. It will train a standard gpt2 model on the babyLM data. The final model will be saved to the output/ folder.
+When you submit your final answer, you will be evaluated on the performance of the checkpoint saved in the output folder. It will be evaluated on a held-out test set.

benchmarks/bibtex-generation/env/arxiv_API_reference.txt ADDED Viewed

	@@ -0,0 +1,599 @@

+arXiv API User's Manual
+Please review the Terms of Use for arXiv APIs before using the arXiv API.
+Table of Contents
+1. Preface
+2. API QuickStart
+3. Structure of the API
+3.1. Calling the API
+3.1.1. Query Interface
+3.1.1.1. search_query and id_list logic
+3.1.1.2. start and max_results paging
+3.1.1.3. sort order for return results
+3.2. The API Response
+3.3. Outline of an Atom feed
+3.3.1. Feed Metadata
+3.3.1.1. <title>, <id>, <link> and <updated>
+3.3.1.2. OpenSearch Extension Elements
+3.3.2. Entry Metadata
+3.3.2.1. <title>, <id>, <published>, and <updated>
+3.3.2.2. <summary>, <author> and <category>
+3.3.2.3. <link>'s
+3.3.2.4. <arxiv> extension elements
+3.4. Errors
+4. Examples
+4.1. Simple Examples
+4.1.1. Perl
+4.1.2. Python
+4.1.3. Ruby
+4.1.4. PHP
+4.2. Detailed Parsing Examples
+5. Appendices
+5.1. Details of Query Construction
+5.1.1. A Note on Article Versions
+5.2. Details of Atom Results Returned
+5.3. Subject Classifications
+1. Preface
+The arXiv API allows programmatic access to the hundreds of thousands of e-prints hosted on arXiv.org.
+This manual is meant to provide an introduction to using the API, as well as documentation describing its details, and as such is meant to be read by both beginning and advanced users. To get a flavor for how the API works, see the API Quickstart. For more detailed information, see Structure of the API.
+For examples of using the API from several popular programming languages including perl, python and ruby, see the Examples section.
+Finally, the Appendices contain an explanation of all input parameters to the API, as well as the output format.
+2. API QuickStart
+The easiest place to start with the API is by accessing it through a web browser. For examples of accessing the API through common programming languages, see the Examples section.
+Most everyone that has read or submitted e-prints on the arXiv is familiar with the arXiv human web interface. These HTML pages can be accessed by opening up your web browser, and entering the following url in your web browser
+http://arxiv.org
+From there, the article listings can be browsed by clicking on one of the many links, or you can search for articles using the search box in the upper right hand side of the page. For example, if I wanted to search for articles that contain the word electron in the title or abstract, I would type electron in the search box, and click Go. If you follow my example, you will see something like this: a web page listing the title and authors of each result, with links to the abstract page, pdf, etc.
+In its simplest form, the API can be used in exactly the same way. However, it uses a few shortcuts so there is less clicking involved. For example, you can see the same search results for electron by entering the url
+http://export.arxiv.org/api/query?search_query=all:electron.
+Alternatively, you can search for articles that contain electron AND proton with the API by entering
+http://export.arxiv.org/api/query?search_query=all:electron+AND+all:proton
+What you see will look different from the HTML interface, but it contains the same information as the search done with the human interface. The reason why the results look different is that the API returns results in the Atom 1.0 format, and not HTML. Since Atom is defined as an XML grammar, it is much easier to digest for programs than HTML. The API is not intended to be used inside a web browser by itself, but this is a particularly simple way to debug a program that does use the API.
+You might notice that your web browser has asked you if you want to “subscribe to this feed” after you enter the API url. This is because Atom is one of the formats used by web sites to syndicate their content. These feeds are usually read with feed reader software, and are what is generated by the existing arXiv rss feeds. The current arXiv feeds only give you updates on new papers within the category you specify. One immediately useful thing to do with the API then is to generate your own feed, based on a custom query!
+To learn more about how to construct custom search queries with the API, see the appendix on the details of query construction. To learn about what information is returned by the API, see the section on the API response. To learn more about writing programs to call the API, and digest the responses, we suggest starting with the section on Structure of the API.
+3. Structure of the API
+In this section, we'll go over some of the details of interacting with the API. A diagram of a typical API call is shown below:
+Example: A typical API call
+Request from url: http://export.arxiv.org/api/query  (1)
+ with parameters: search_query=all:electron
+                .
+                .
+                .
+API server processes the request and sends the response
+                .
+                .
+                .
+Response received by client.  (2)
+The request can be made via HTTP GET, in which the parameters are encoded in the url, or via an HTTP POST in which the parameters are encoded in the HTTP request header. Most client libraries support both methods.
+If all goes well, the HTTP header will show a 200 OK status, and the response body will contain the Atom response content as shown in the example response.
+3.1. Calling the API
+As mentioned above, the API can be called with an HTTP request of type GET or POST. For our purposes, the main difference is that the parameters are included in the url for a GET request, but not for the POST request. Thus if the parameters list is unusually long, a POST request might be preferred.
+The parameters for each of the API methods are explained below. For each method, the base url is
+http://export.arxiv.org/api/{method_name}?{parameters}
+3.1.1. Query Interface
+The API query interface has method_name=query. The table below outlines the parameters that can be passed to the query interface. Parameters are separated with the & sign in the constructed url's.
+query
+parameters	type	defaults	required
+search_query	string	None	No
+id_list	comma-delimited string	None	No
+start	int	0	No
+max_results	int	10	No
+3.1.1.1. SEARCH_QUERY AND ID_LIST LOGIC
+We have already seen the use of search_query in the quickstart section. The search_query takes a string that represents a search query used to find articles. The construction of search_query is described in the search query construction appendix. The id_list contains a comma-delimited list of arXiv id's.
+The logic of these two parameters is as follows:
+If only search_query is given (id_list is blank or not given), then the API will return results for each article that matches the search query.
+If only id_list is given (search_query is blank or not given), then the API will return results for each article in id_list.
+If BOTH search_query and id_list are given, then the API will return each article in id_list that matches search_query. This allows the API to act as a results filter.
+This is summarized in the following table:
+search_query present	id_list present	API returns
+yes	no	articles that match search_query
+no	yes	articles that are in id_list
+yes	yes	articles in id_list that also match search_query
+3.1.1.2. START AND MAX_RESULTS PAGING
+Many times there are hundreds of results for an API query. Rather than download information about all the results at once, the API offers a paging mechanism through start and max_results that allows you to download chunks of the result set at a time. Within the total results set, start defines the index of the first returned result, using 0-based indexing. max_results is the number of results returned by the query. For example, if we wanted to step through the results of a search_query of all:electron, we would construct the urls:
+http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10 (1)
+http://export.arxiv.org/api/query?search_query=all:electron&start=10&max_results=10 (2)
+http://export.arxiv.org/api/query?search_query=all:electron&start=20&max_results=10 (3)
+Get results 0-9
+Get results 10-19
+Get results 20-29
+Detailed examples of how to perform paging in a variety of programming languages can be found in the examples section.
+In cases where the API needs to be called multiple times in a row, we encourage you to play nice and incorporate a 3 second delay in your code. The detailed examples below illustrate how to do this in a variety of languages.
+Because of speed limitations in our implementation of the API, the maximum number of results returned from a single call (max_results) is limited to 30000 in slices of at most 2000 at a time, using the max_results and start query parameters. For example, to retrieve matches 6001-8000: http://export.arxiv.org/api/query?search_query=all:electron&start=6000&max_results=2000
+Large result sets put considerable load on the server and also take a long time to render. We recommend to refine queries which return more than 1,000 results, or at least request smaller slices. For bulk metadata harvesting or set information, etc., the OAI-PMH interface is more suitable. A request with max_results >30,000 will result in an HTTP 400 error code with appropriate explanation. A request for 30000 results will typically take a little over 2 minutes to return a response of over 15MB. Requests for fewer results are much faster and correspondingly smaller.
+3.1.1.3. SORT ORDER FOR RETURN RESULTS
+There are two options for ordering the result set of the API search, sortBy and sortOrder.
+sortBy can be "relevance", "lastUpdatedDate", "submittedDate"
+sortOrder can be either "ascending" or "descending"
+A sample query using these new parameters looks like:
+http://export.arxiv.org/api/query?search_query=ti:"electron thermal conductivity"&sortBy=lastUpdatedDate&sortOrder=ascending
+3.2. The API Response
+Everything returned by the API in the body of the HTTP responses is Atom 1.0, including errors. Atom is a grammar of XML that is popular in the world of content syndication, and is very similar to RSS for this purpose. Typically web sites with dynamic content such as news sites and blogs will publish their content as Atom or RSS feeds. However, Atom is a general format that embodies the concept of a list of items, and thus is well-suited to returning the arXiv search results.
+3.3. Outline of an Atom feed
+In this section we will discuss the contents of the Atom documents returned by the API. To see the full explanation of the Atom 1.0 format, please see the Atom specification.
+An API response consists of an Atom <feed> element which contains metadata about the API call performed, as well as child <entry> elements which embody the metadata for each of the returned results. Below we explain each of the elements and attributes. We will base our discussion on the sample results feed discussed in the examples section.
+You may notice that the results from the API are ordered differently than the results given by the HTML arXiv search interface. The HTML interface automatically sorts results in descending order based on the date of their submission, while the API returns results according to relevancy from the internal search engine. Thus when debugging a search query, we encourage you to use the API within a web browser, rather than the HTML search interface. If you want sorting by date, you can always do this within your programs by reading the <published> tag for each entry as explained below.
+3.3.1. Feed Metadata
+Every response will contain the line:
+<?xml version="1.0" encoding="utf-8"?>
+to signify that we are receiving XML 1.0 with a UTF-8 encoding. Following that line will be a line indicating that we are receiving an Atom feed:
+<feed xmlns="http://www.w3.org/2005/Atom"
+xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
+xmlns:arxiv="http://arxiv.org/schemas/atom">
+You will notice that three XML namespaces are defined. The default namespace signifies that we are dealing with Atom 1.0. The other two namespaces define extensions to Atom that we describe below.
+3.3.1.1. <TITLE>, <ID>, <LINK> AND <UPDATED>
+The <title> element gives the title for the feed:
+<title xmlns="http://www.w3.org/2005/Atom">
+    ArXiv Query:  search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1
+</title>
+The title contains a canonicalized version of the query used to call the API. The canonicalization includes all parameters, using their defaults if they were not included, and always puts them in the order search_query,id_list,start,max_results, even if they were specified in a different order in the actual query.
+The <id> element serves as a unique id for this query, and is useful if you are writing a program such as a feed reader that wants to keep track of all the feeds requested in the past. This id can then be used as a key in a database.
+<id xmlns="http://www.w3.org/2005/Atom">
+    http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8
+</id>
+The id is guaranteed to be unique for each query.
+The <link> element provides a URL that can be used to retrieve this feed again.
+<link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/api/query?search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1" rel="self" type="application/atom+xml"/>
+Note that the url in the link represents the canonicalized version of the query. The <link> provides a GET requestable url, even if the original request was done via POST.
+The <updated> element provides the time at which the contents of the feed were last updated:
+<updated xmlns="http://www.w3.org/2005/Atom">2007-10-08T00:00:00-04:00</updated>
+Because the arXiv submission process works on a 24 hour submission cycle, new articles are only available to the API on the midnight after the articles were processed. The <updated> tag thus reflects the midnight of the day that you are calling the API. This is very important - search results do not change until new articles are added. Therefore there is no need to call the API more than once in a day for the same query. Please cache your results. This primarily applies to production systems, and of course you are free to play around with the API while you are developing your program!
+3.3.1.2. OPENSEARCH EXTENSION ELEMENTS
+There are several extension elements defined in the OpenSearch namespace
+http://a9.com/-/spec/opensearch/1.1/
+OpenSearch is a lightweight technology that acts in a similar way as the Web Services Description Language. The OpenSearch elements we have included allow OpenSearch enabled clients to digest our results. Such clients often include search result aggregators and browser plugins that allow searching from a variety of sources.
+The OpenSearch extension elements can still be useful to you even if you are not writing one of these applications. The <opensearch:totalResults> element lists how many results are in the result set for the query:
+<opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
+   1000
+</opensearch:totalResults>
+This can be very useful when implementing paging of search results. The other two elements <opensearch:startIndex>, and <opensearch:itemsPerPage> are analogous to start, and max_results discussed above.
+<opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
+   0
+</opensearch:startIndex>
+<opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
+   1
+</opensearch:itemsPerPage>
+3.3.2. Entry Metadata
+If there are no errors, the <feed> element contains 0 or more child <entry> elements with each <entry> representing an article in the returned results set. As explained in the errors section, if there are errors, a single <entry> element representing the error is returned. The descriptions below cover the elements of <entry>'s representing arXiv articles. For a general discussion of arXiv metadata, see the arXiv metadata explanation.
+3.3.2.1. <TITLE>, <ID>, <PUBLISHED>, AND <UPDATED>
+The <title> element contains the title of the article returned:
+<title xmlns="http://www.w3.org/2005/Atom">
+    Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA
+</title>
+The <id> element contains a url that resolves to the abstract page for that article:
+<id xmlns="http://www.w3.org/2005/Atom">
+    http://arxiv.org/abs/hep-ex/0307015
+</id>
+If you want only the arXiv id for the article, you can remove the leading http://arxiv.org/abs/ in the <id>.
+The <published> tag contains the date in which the first version of this article was submitted and processed. The <updated> element contains the date on which the retrieved article was submitted and processed. If the version is version 1, then <published> == <updated>, otherwise they are different. In the example below, the article retrieved was version 2, so <updated> and <published> are different (see the original query).
+<published xmlns="http://www.w3.org/2005/Atom">
+    2007-02-27T16:02:02-05:00
+</published>
+<updated xmlns="http://www.w3.org/2005/Atom">
+    2007-06-25T17:09:59-04:00
+</updated>
+3.3.2.2. <SUMMARY>, <AUTHOR> AND <CATEGORY>
+The <summary> element contains the abstract for the article:
+<summary xmlns="http://www.w3.org/2005/Atom">
+    Multi-electron production is studied at high electron transverse momentum
+    in positron- and electron-proton collisions using the H1 detector at HERA.
+    The data correspond to an integrated luminosity of 115 pb-1. Di-electron
+    and tri-electron event yields are measured. Cross sections are derived in
+    a restricted phase space region dominated by photon-photon collisions. In
+    general good agreement is found with the Standard Model predictions.
+    However, for electron pair invariant masses above 100 GeV, three
+    di-electron events and three tri-electron events are observed, compared to
+    Standard Model expectations of 0.30 \pm 0.04 and 0.23 \pm 0.04,
+    respectively.
+</summary>
+There is one <author> element for each author of the paper in order of authorship. Each <author> element has a <name> sub-element which contains the name of the author.
+<author xmlns="http://www.w3.org/2005/Atom">
+      <name xmlns="http://www.w3.org/2005/Atom">H1 Collaboration</name>
+</author>
+If author affiliation is present, it is included as an <arxiv:affiliation> subelement of the <author> element as discussed below.
+The <category> element is used to describe either an arXiv, ACM, or MSC classification. See the arXiv metadata explanation for more details about these classifications. The <category> element has two attributes, scheme, which is the categorization scheme, and term which is the term used in the categorization. Here is an example from the query http://export.arxiv.org/api/query?id_list=cs/9901002v1
+<category xmlns="http://www.w3.org/2005/Atom" term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
+<category xmlns="http://www.w3.org/2005/Atom" term="cs.AI" scheme="http://arxiv.org/schemas/atom"/>
+<category xmlns="http://www.w3.org/2005/Atom" term="I.2.6" scheme="http://arxiv.org/schemas/atom"/>
+Note that in this example, there are 3 category elements, one for each category. The first two correspond to arXiv categories, and the last one to an ACM category. See <arxiv> extension elements below for information on how to identify the arXiv primary category.
+3.3.2.3. <LINK>'S
+For each entry, there are up to three <link> elements, distinguished by their rel and title attributes. The table below summarizes what these links refer to
+rel	title	refers to	always present
+alternate	-	abstract page	yes
+related	pdf	pdf	yes
+related	doi	resolved doi	no
+For example:
+<link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/abs/hep-ex/0307015v1" rel="alternate" type="text/html"/>
+<link xmlns="http://www.w3.org/2005/Atom" title="pdf" href="http://arxiv.org/pdf/hep-ex/0307015v1" rel="related" type="application/pdf"/>
+<link xmlns="http://www.w3.org/2005/Atom" title="doi" href="http://dx.doi.org/10.1529/biophysj.104.047340" rel="related"/>
+3.3.2.4. <ARXIV> EXTENSION ELEMENTS
+There are several pieces of arXiv metadata that are not able to be mapped onto the standard Atom specification. We have therefore defined several extension elements which live in the arxiv namespace
+http://arxiv.org/schemas/atom
+The arXiv classification system supports multiple <category> tags, as well as a primary classification. The primary classification is a replica of an Atom <category> tag, except it has the name <arxiv:primary_category>. For example, from the query http://export.arxiv.org/api/query?id_list=cs/9901002v1, we have
+<arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
+signifying that cs.LG is the primary arXiv classification for this e-print.
+The <arxiv:comment> element contains the typical author comments found on most arXiv articles:
+<arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">
+   23 pages, 8 figures and 4 tables
+</arxiv:comment>
+If the author has supplied affiliation information, then this is included as an <arxiv:affiliation> subelement of the standard Atom <author> element. For example, from the query http://export.arxiv.org/api/query?id_list=0710.5765v1, we have
+<author>
+   <name>G. G. Kacprzak</name>
+   <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NMSU</arxiv:affiliation>
+</author>
+If the author has provided a journal reference for the article, then there will be a <arxiv:journal_ref> element with this information:
+<arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">
+   Eur.Phys.J. C31 (2003) 17-29
+</arxiv:journal_ref>
+If the author has provided a DOI for the article, then there will be a <arxiv:doi> element with this information:
+<arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">
+   10.1529/biophysj.104.047340
+</arxiv:doi>
+3.4. Errors
+Errors are returned as Atom feeds with a single entry representing the error. The <summary> for the error contains a helpful error message, and the <link> element contains a url to a more detailed explanation of the message.
+For example, the API call http://export.arxiv.org/api/query?id_list=1234.12345 contains a malformed id, and results in the error
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
+  <link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/api/query?search_query=&amp;id_list=1234.12345" rel="self" type="application/atom+xml"/>
+  <title xmlns="http://www.w3.org/2005/Atom">ArXiv Query: search_query=&amp;id_list=1234.12345</title>
+  <id xmlns="http://www.w3.org/2005/Atom">http://arxiv.org/api/kvuntZ8c9a4Eq5CF7KY03nMug+Q</id>
+  <updated xmlns="http://www.w3.org/2005/Atom">2007-10-12T00:00:00-04:00</updated>
+  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>
+  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
+  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
+  <entry xmlns="http://www.w3.org/2005/Atom">
+    <id xmlns="http://www.w3.org/2005/Atom">http://arxiv.org/api/errors#incorrect_id_format_for_1234.12345</id>
+    <title xmlns="http://www.w3.org/2005/Atom">Error</title>
+    <summary xmlns="http://www.w3.org/2005/Atom">incorrect id format for 1234.12345</summary>
+    <updated xmlns="http://www.w3.org/2005/Atom">2007-10-12T00:00:00-04:00</updated>
+    <link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/api/errors#incorrect_id_format_for_1234.12345" rel="alternate" type="text/html"/>
+    <author xmlns="http://www.w3.org/2005/Atom">
+      <name xmlns="http://www.w3.org/2005/Atom">arXiv api core</name>
+    </author>
+  </entry>
+</feed>
+The following table gives information on errors that might occur.
+Sample query	Error Explanation
+http://export.arxiv.org/api/query?start=not_an_int	start must be an integer
+http://export.arxiv.org/api/query?start=-1	start must be >= 0
+http://export.arxiv.org/api/query?max_results=not_an_int	max_results must be an integer
+http://export.arxiv.org/api/query?max_results=-1	max_results must be >= 0
+http://export.arxiv.org/api/query?id_list=1234.1234	malformed id - see arxiv identifier explanation
+http://export.arxiv.org/api/query?id_list=cond—mat/0709123	malformed id - see arxiv identifier explanation
+4. Examples
+Once you have familiarized yourself with the API, you should be able to easily write programs that call the API automatically. Most programming languages, if not all, have libraries that allow you to make HTTP requests. Since Atom is growing, not all languages have libraries that support Atom parsing, so most of the programming effort will be in digesting the responses you receive. The languages that we know of that can easily handle calling the api via HTTP and parsing the results include:
+Perl (via LWP) (example)
+Python (via urllib) (example)
+Ruby (via uri and net::http) (example)
+PHP (via file_get_contents()) (example)
+4.1. Simple Examples
+Below we include code snippets for these languages that perform the bare minimum functionality - calling the api and printing the raw Atom results. If your favorite language is not up here, write us with an example, and we'll be glad to post it!
+All of the simple examples produce an output which looks like:
+Example: A Typical Atom Response
+    <?xml version="1.0" encoding="utf-8"?>
+    <feed xmlns="http://www.w3.org/2005/Atom" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:arxiv="http://arxiv.org/schemas/atom">
+      <link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/api/query?search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1" rel="self" type="application/atom+xml"/>
+      <title xmlns="http://www.w3.org/2005/Atom">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>
+      <id xmlns="http://www.w3.org/2005/Atom">http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8</id>
+      <updated xmlns="http://www.w3.org/2005/Atom">2007-10-08T00:00:00-04:00</updated>
+      <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1000</opensearch:totalResults>
+      <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
+      <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
+      <entry xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
+        <id xmlns="http://www.w3.org/2005/Atom">http://arxiv.org/abs/hep-ex/0307015</id>
+        <published xmlns="http://www.w3.org/2005/Atom">2003-07-07T13:46:39-04:00</published>
+        <updated xmlns="http://www.w3.org/2005/Atom">2003-07-07T13:46:39-04:00</updated>
+        <title xmlns="http://www.w3.org/2005/Atom">Multi-Electron Production at High Transverse Momenta in ep Collisions at
+      HERA</title>
+        <summary xmlns="http://www.w3.org/2005/Atom">  Multi-electron production is studied at high electron transverse momentum in
+    positron- and electron-proton collisions using the H1 detector at HERA. The
+    data correspond to an integrated luminosity of 115 pb-1. Di-electron and
+    tri-electron event yields are measured. Cross sections are derived in a
+    restricted phase space region dominated by photon-photon collisions. In general
+    good agreement is found with the Standard Model predictions. However, for
+    electron pair invariant masses above 100 GeV, three di-electron events and
+    three tri-electron events are observed, compared to Standard Model expectations
+    of 0.30 \pm 0.04 and 0.23 \pm 0.04, respectively.
+    </summary>
+        <author xmlns="http://www.w3.org/2005/Atom">
+          <name xmlns="http://www.w3.org/2005/Atom">H1 Collaboration</name>
+        </author>
+        <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">23 pages, 8 figures and 4 tables</arxiv:comment>
+        <arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">Eur.Phys.J. C31 (2003) 17-29</arxiv:journal_ref>
+        <link xmlns="http://www.w3.org/2005/Atom" href="http://arxiv.org/abs/hep-ex/0307015v1" rel="alternate" type="text/html"/>
+        <link xmlns="http://www.w3.org/2005/Atom" title="pdf" href="http://arxiv.org/pdf/hep-ex/0307015v1" rel="related" type="application/pdf"/>
+        <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="hep-ex" scheme="http://arxiv.org/schemas/atom"/>
+        <category term="hep-ex" scheme="http://arxiv.org/schemas/atom"/>
+      </entry>
+    </feed>
+4.1.1. Perl
+LWP is in the default perl installation on most platforms. It can be downloaded and installed from CPAN. Sample code to produce the above output is:
+    use LWP;
+    use strict;
+    my $url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1';
+    my $browser = LWP::UserAgent->new();
+    my $response = $browser->get($url);
+    print $response->content();
+4.1.2. Python
+The urllib module is part of the python standard library, and is included in any default installation of python. Sample code to produce the above output in Python 2.7 is:
+    import urllib
+    url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1'
+    data = urllib.urlopen(url).read()
+    print data
+whereas in Python 3 an example would be:
+    import urllib.request as libreq
+    with libreq.urlopen('http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1') as url:
+      r = url.read()
+    print(r)
+4.1.3. Ruby
+The net/http and uri modules are part of the ruby standard library, and are included in any default installation of ruby. Sample code to produce the above output is:
+    require 'net/http'
+    require 'uri'
+    url = URI.parse('http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1')
+    res = Net::HTTP.get_response(url)
+    print res.body
+4.1.4. PHP
+The file_get_contents() function is part of the PHP core language:
+    <?php
+    $url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1';
+    $response = file_get_contents($url);
+    print_r($response);
+    ?>
+4.2. Detailed Parsing Examples
+The examples above don't cover how to parse the Atom results returned to extract the information you might be interested in. They also don't cover how to do more advanced programming of the API to perform such tasks as downloading chunks of the full results list one page at a time. The table below contains links to more detailed examples for each of the languages above, as well as to the libraries used to parse Atom.
+Language	Library	Parsing Example	Paging Example
+Perl	XML::Atom	parsing	paging
+Python	feedparser	parsing	paging
+Ruby	feedtools	parsing	paging
+PHP	SimplePie	parsing	paging
+5. Appendices
+5.1. Details of Query Construction
+As outlined in the Structure of the API section, the interface to the API is quite simple. This simplicity, combined with search_query construction, and result set filtering through id_list makes the API a powerful tool for harvesting data from the arXiv. In this section, we outline the possibilities for constructing search_query's to retrieve our desired article lists. We outlined how to use the id_list parameter to filter results sets in search_query and id_list logic.
+In the arXiv search engine, each article is divided up into a number of fields that can individually be searched. For example, the titles of an article can be searched, as well as the author list, abstracts, comments and journal reference. To search one of these fields, we simply prepend the field prefix followed by a colon to our search term. For example, suppose we wanted to find all articles by the author Adrian Del Maestro. We could construct the following query
+http://export.arxiv.org/api/query?search_query=au:del_maestro
+This returns nine results. The following table lists the field prefixes for all the fields that can be searched.
+prefix	explanation
+ti	Title
+au	Author
+abs	Abstract
+co	Comment
+jr	Journal Reference
+cat	Subject Category
+rn	Report Number
+id	Id (use id_list instead)
+all	All of the above
+Note: The id_list parameter should be used rather than search_query=id:xxx to properly handle article versions. In addition, note that all: searches in each of the fields simultaneously.
+The API allows advanced query construction by combining these search fields with Boolean operators. For example, suppose we want to find all articles by the author Adrian DelMaestro that also contain the word checkerboard in the title. We could construct the following query, using the AND operator:
+http://export.arxiv.org/api/query?search_query=au:del_maestro+AND+ti:checkerboard
+As expected, this query picked out the one of the nine previous results with checkerboard in the title. Note that we included + signs in the urls to the API. In a url, a + sign encodes a space, which is useful since spaces are not allowed in url's. It is always a good idea to escape the characters in your url's, which is a common feature in most programming libraries that deal with url's. Note that the <title> of the returned feed has spaces in the query constructed. It is a good idea to look at <title> to see if you have escaped your url correctly.
+The following table lists the three possible Boolean operators.
+AND
+OR
+ANDNOT
+The ANDNOT Boolean operator is particularly useful, as it allows us to filter search results based on certain fields. For example, if we wanted all of the articles by the author Adrian DelMaestro with titles that did not contain the word checkerboard, we could construct the following query:
+http://export.arxiv.org/api/query?search_query=au:del_maestro+ANDNOT+ti:checkerboard
+As expected, this query returns eight results.
+Finally, even more complex queries can be used by using parentheses for grouping the Boolean expressions. To include parentheses in a url, use %28 for a left parenthesis ( and %29 for a right parenthesis ). For example, if we wanted all of the articles by the author Adrian DelMaestro with titles that did not contain the words checkerboard OR Pyrochlore, we could construct the following query:
+http://export.arxiv.org/api/query?search_query=au:del_maestro+ANDNOT+%28ti:checkerboard+OR+ti:Pyrochlore%29
+This query returns three results. Notice that the <title> element displays the parentheses correctly, meaning that we used the correct url escaping.
+So far we have only used single words as the field terms to search for. You can include entire phrases by enclosing the phrase in double quotes, escaped by %22. For example, if we wanted all of the articles by the author Adrian DelMaestro with titles that contain quantum criticality, we could construct the following query:
+http://export.arxiv.org/api/query?search_query=au:del_maestro+AND+ti:%22quantum+criticality%22
+This query returns one result, and notice that the feed <title> contains double quotes as expected. The table below lists the two grouping operators used in the API.
+symbol	encoding	explanation
+( )	%28 %29	Used to group Boolean expressions for Boolean operator precedence.
+double quotes	%22 %22	Used to group multiple words into phrases to search a particular field.
+space	+	Used to extend a search_query to include multiple fields.
+5.1.1. A Note on Article Versions
+Each arXiv article has a version associated with it. The first time an article is posted, it is given a version number of 1. When subsequent corrections are made to an article, it is resubmitted, and the version number is incremented. At any time, any version of an article may be retrieved.
+When using the API, if you want to retrieve the latest version of an article, you may simply enter the arxiv id in the id_list parameter. If you want to retrieve information about a specific version, you can do this by appending vn to the id, where n is the version number you are interested in.
+For example, to retrieve the latest version of cond-mat/0207270, you could use the query http://export.arxiv.org/api/query?id_list=cond-mat/0207270. To retrieve the very first version of this article, you could use the query http://export.arxiv.org/api/query?id_list=cond-mat/0207270v1
+5.2. Details of Atom Results Returned
+The following table lists each element of the returned Atom results. For a more detailed explanation see Outline of an Atom Feed.
+element	explanation
+feed elements
+<title>	The title of the feed containing a canonicalized query string.
+<id>	A unique id assigned to this query.
+<updated>	The last time search results for this query were updated. Set to midnight of the current day.
+<link>	A url that will retrieve this feed via a GET request.
+<opensearch:totalResults>	The total number of search results for this query.
+<opensearch:startIndex>	The 0-based index of the first returned result in the total results list.
+<opensearch:itemsPerPage>	The number of results returned.
+entry elements
+<title>	The title of the article.
+<id>	A url http://arxiv.org/abs/id
+<published>	The date that version 1 of the article was submitted.
+<updated>	The date that the retrieved version of the article was submitted. Same as <published> if the retrieved version is version 1.
+<summary>	The article abstract.
+<author>	One for each author. Has child element <name> containing the author name.
+<link>	Can be up to 3 given url's associated with this article.
+<category>	The arXiv or ACM or MSC category for an article if present.
+<arxiv:primary_category>	The primary arXiv category.
+<arxiv:comment>	The author's comment if present.
+<arxiv:affiliation>	The author's affiliation included as a subelement of <author> if present.
+<arxiv:journal_ref>	A journal reference if present.
+<arxiv:doi>	A url for the resolved DOI to an external resource if present.
+5.3. Subject Classifications
+For the complete list of arXiv subject classifications, please visit the taxonomy page.

benchmarks/bibtex-generation/env/bibtex_generation.py ADDED Viewed

File without changes

benchmarks/bibtex-generation/env/claude_example.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import os
+import anthropic
+client = anthropic.Client(open("claude_api_key.txt").read().strip())
+response = client.completion(
+    prompt=f"{anthropic.HUMAN_PROMPT} How many toes do dogs have?{anthropic.AI_PROMPT}",
+    stop_sequences = [anthropic.HUMAN_PROMPT],
+    model="claude-v1",
+    max_tokens_to_sample=100,
+)
+print(response)

benchmarks/bibtex-generation/env/google_scholar_API_reference.txt ADDED Viewed

	@@ -0,0 +1,153 @@

+Google Scholar API
+build
+754 examples, 0 failures
+Our Google Scholar API allows you to scrape SERP results from a Google Scholar search query. The API is accessed through the following endpoint: /search?engine=google_scholar. A user may query the following: https://serpapi.com/search?engine=google_scholar utilizing a GET request. Head to the playground for a live and interactive demo.
+API Parameters
+Search Query
+q
+Required
+Parameter defines the query you want to search. You can also use helpers in your query such as: author:, or source:.
+Usage of cites parameter makes q optional. Usage of cites together with q triggers search within citing articles.
+Usage of cluster together with q and cites parameters is prohibited. Use cluster parameter only.
+Advanced Google Scholar Parameters
+cites
+Optional
+Parameter defines unique ID for an article to trigger Cited By searches. Usage of cites will bring up a list of citing documents in Google Scholar. Example value: cites=1275980731835430123. Usage of cites and q parameters triggers search within citing articles.
+as_ylo
+Optional
+Parameter defines the year from which you want the results to be included. (e.g. if you set as_ylo parameter to the year 2018, the results before that year will be omitted.). This parameter can be combined with the as_yhi parameter.
+as_yhi
+Optional
+Parameter defines the year until which you want the results to be included. (e.g. if you set as_yhi parameter to the year 2018, the results after that year will be omitted.). This parameter can be combined with the as_ylo parameter.
+scisbd
+Optional
+Parameter defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything. The default value is 0 which means that the articles are sorted by relevance.
+cluster
+Optional
+Parameter defines unique ID for an article to trigger All Versions searches. Example value: cluster=1275980731835430123. Usage of cluster together with q and cites parameters is prohibited. Use cluster parameter only.
+Localization
+hl
+Optional
+Parameter defines the language to use for the Google Scholar search. It's a two-letter language code. (e.g., en for English, es for Spanish, or fr for French). Head to the Google languages page for a full list of supported Google languages.
+lr
+Optional
+Parameter defines one or multiple languages to limit the search to. It uses lang_{two-letter language code} to specify languages and | as a delimiter. (e.g., lang_fr|lang_de will only search French and German pages). Head to the Google lr languages for a full list of supported languages.
+Pagination
+start
+Optional
+Parameter defines the result offset. It skips the given number of results. It's used for pagination. (e.g., 0 (default) is the first page of results, 10 is the 2nd page of results, 20 is the 3rd page of results, etc.).
+num
+Optional
+Parameter defines the maximum number of results to return, limited to 20. (e.g., 10 (default) returns 10 results, 20 returns 20 results).
+Search Type
+as_sdt
+Optional
+Parameter can be used either as a search type or a filter.
+As a Filter (only works when searching articles):
+0 - exclude patents (default).
+7 - include patents.
+As a Search Type:
+4 - Select case law (US courts only). This will select all the State and Federal courts.
+e.g. as_sdt=4 - Selects case law (all courts)
+To select specific courts, see the full list of supported Google Scholar courts.
+e.g. as_sdt=4,33,192 - 4 is the required value and should always be in the first position, 33 selects all New York courts and 192 selects Tax Court.
+Values have to be separated by comma (,)
+Advanced Filters
+safe
+Optional
+Parameter defines the level of filtering for adult content. It can be set to active, or off (default).
+filter
+Optional
+Parameter defines if the filters for 'Similar Results' and 'Omitted Results' are on or off. It can be set to 1 (default) to enable these filters, or 0 to disable these filters.
+as_vis
+Optional
+Parameter defines whether you would like to include citations or not. It can be set to 1 to exclude these results, or 0 (default) to include them.
+Serpapi Parameters
+engine
+Required
+Set parameter to google_scholar to use the Google Scholar API engine.
+no_cache
+Optional
+Parameter will force SerpApi to fetch the Google Scholar results even if a cached version is already present. A cache is served only if the query and all parameters are exactly the same. Cache expires after 1h. Cached searches are free, and are not counted towards your searches per month. It can be set to false (default) to allow results from the cache, or true to disallow results from the cache. no_cache and async parameters should not be used together.
+async
+Optional
+Parameter defines the way you want to submit your search to SerpApi. It can be set to false (default) to open an HTTP connection and keep it open until you get your search results, or true to just submit your search to SerpApi and retrieve them later. In this case, you'll need to use our Searches Archive API to retrieve your results. async and no_cache parameters should not be used together. async should not be used on accounts with Ludicrous Speed enabled.
+api_key
+Required
+Parameter defines the SerpApi private key to use.
+output
+Optional
+Parameter defines the final output you want. It can be set to json (default) to get a structured JSON of the results, or html to get the raw html retrieved.
+API Results
+JSON Results
+JSON output includes structured data for organic results.
+A search status is accessible through search_metadata.status. It flows this way: Processing -> Success || Error. If a search has failed, error will contain an error message. search_metadata.id is the search ID inside SerpApi.
+HTML Results
+HTML output is useful to debug JSON results or support features not supported yet by SerpApi.
+HTML output gives you the raw HTML results from Google.