diff --git a/.gitignore b/.gitignore
index c6a6c704..a98993a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,7 @@ MNIST/*
log.txt
colabs/log.txt
artifacts
-examples/jax/jax-llm/proteins-base/
\ No newline at end of file
+examples/jax/jax-llm/proteins-base/
+colabs/anthropic/.env
+colabs/anthropic/summarization/.env
+colabs/anthropic/media
\ No newline at end of file
diff --git a/colabs/anthropic/summarization/ReadMe.md b/colabs/anthropic/summarization/ReadMe.md
new file mode 100644
index 00000000..18625b8c
--- /dev/null
+++ b/colabs/anthropic/summarization/ReadMe.md
@@ -0,0 +1,1348 @@
+# Arxiv PDF Summarization Bot using Chain of Density
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/add-summarization-example/colabs/anthropic/summarization/chain_of_density_arxiv.ipynb)
+
+
+This cookbook walks through the implementation of an AI-powered summarization bot that extracts concise, information-dense summaries from Arxiv papers using the Chain of Density technique. We'll use Anthropic's Claude API, the Arxiv API, PyPDF2 for PDF processing, and Weave for experiment tracking and evaluation.
+
+## Setup and Imports
+
+First, let's set up our environment and import the libraries we'll need for this project (a minimal import block is sketched after this list), including:
+- `anthropic` for interacting with Claude API
+- `arxiv` for fetching paper metadata and PDFs
+- `PyPDF2` for PDF text extraction
+- `weave` for experiment tracking and evaluation
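+
+The notebook's import cell looks roughly like the following (a minimal sketch; install the dependencies from `requirements.txt` first):
+
+```python
+import base64
+import io
+import json
+import os
+from datetime import datetime, timezone
+from itertools import product
+
+import anthropic
+import arxiv
+import filetype
+import numpy as np
+import PyPDF2
+import requests
+import weave
+from openai import OpenAI
+from pdf2image import convert_from_bytes
+from PIL import Image
+
+# Local Pydantic models for Arxiv papers (see arxiv_models.py)
+from arxiv_models import ArxivPaper, Author, Link, convert_raw_arxiv_to_pydantic
+```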
+
+## Initializing Weave and Anthropic Client
+
+Next, we initialize Weave for experiment tracking and set up the Anthropic client:
+
+```python
+weave.init("arxiv-chain-of-density-summarization")
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+```
+
+This sets up Weave with a specific project name and initializes the Anthropic client using the API key stored in the `ANTHROPIC_API_KEY` environment variable.
+
+## Optional: Fetching Arxiv Papers
+
+
+
+We implement functions to fetch relevant papers from the Arxiv database:
+
+
+
+```python
+@weave.op()
+def generate_arxiv_query_args(instruction, model="claude-3-sonnet-20240229"):
+ # Define the tools available to the LLM
+ tools = [{
+ "name": "prepare_arxiv_search",
+ "description": "Prepare arguments for ArXiv paper search. This tool generates an optimal query string utilizing Boolean operators, field-specific syntax, and precise search terms. It also determines an efficient maximum number of results to fetch, balancing comprehensive coverage with processing efficiency. The output is tailored to the given research instruction, aiming to provide relevant and focused search results.",
+ "input_schema": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "The ArXiv search query string. Supports Boolean operators (AND, OR, NOT), field-specific syntax (e.g., 'ti:' for title, 'au:' for author), quotation marks for exact phrases, and wildcards. Can include multiple search terms to refine results based on title, abstract, authors, comments, journal reference, subject category, or report number."
+ },
+ "max_results": {
+ "type": "integer",
+ "description": "The maximum number of paper results to return from the ArXiv search. Aims to minimize the number of results while ensuring sufficient coverage of the topic. Defaults to 5 if not specified. Increasing this value broadens the search but may increase processing time and resource usage. Aim to be below 10 articles."
+ }
+ },
+ "required": ["query", "max_results"]
+ }
+ }]
+
+ # Define the system prompt for the LLM
+ system_prompt = """You are an expert at generating ArXiv queries. Use the prepare_arxiv_search tool to create an optimal query and determine the appropriate maximum number of results for the given research question. The query should utilize advanced search techniques including Boolean operators, field-specific syntax, and precise terms to ensure comprehensive yet focused results."""
+
+ # Create the user message with the instruction
+ messages = [
+ {
+ "role": "user",
+ "content": f"Use the prepare_arxiv_search tool to generate an optimal ArXiv query and determine the maximum number of results for the following research instruction: {instruction}"
+ }
+ ]
+
+ # Make the API call to the LLM
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=messages,
+ system=system_prompt,
+ tools=tools
+ )
+
+ # Extract the query and max_results from the response
+ for content in response.content:
+ if content.type == 'tool_use' and content.name == 'prepare_arxiv_search':
+ args = content.input
+ return args.get('query'), args.get('max_results')
+
+    # If no tool use was found, fall back to the raw instruction as the query and a default of 5 results
+    return instruction, 5
+```
+
+
+
+```python
+@weave.op()
+def fetch_arxiv_papers(query, max_results=5):
+ # Initialize the arxiv Client
+ arxiv_client = arxiv.Client()
+
+ # Create the search object with the provided query and max_results
+ search = arxiv.Search(
+ query=query,
+ max_results=max_results,
+ sort_by=arxiv.SortCriterion.Relevance,
+ sort_order=arxiv.SortOrder.Descending
+ )
+
+ # Fetch the results using client.results() and convert them to ArxivPaper objects
+ papers = []
+ for result in arxiv_client.results(search):
+ # Convert the raw arxiv result to our custom ArxivPaper object
+ paper = convert_raw_arxiv_to_pydantic(result)
+ papers.append(paper)
+
+ return papers
+```
+
+These functions use Claude to generate an optimal Arxiv search query based on a given instruction and then fetch the relevant papers using the Arxiv API.
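+
+Chaining the two together might look like this (a usage sketch; the instruction string is purely illustrative):
+
+```python
+instruction = "Find recent benchmarks for retrieval-augmented generation (RAG) systems."
+query, max_results = generate_arxiv_query_args(instruction)
+papers = fetch_arxiv_papers(query, max_results=max_results)
+for paper in papers:
+    print(paper.title)
+```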
+
+
+
+## Creating Sample Arxiv Paper Objects
+
+For demonstration purposes, we create a sample `ArxivPaper` object:
+
+
+
+```python
+arxiv_paper = ArxivPaper(
+ entry_id="http://arxiv.org/abs/2406.04744v1",
+ updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ title="CRAG -- Comprehensive RAG Benchmark",
+ authors=[Author(full_name="Xiao Yang")],
+ summary="CRAG: A benchmark for Retrieval-Augmented Generation (RAG) with 4,409 QA pairs across diverse domains.",
+ doi="10.48550/arXiv.2406.04744",
+ primary_category="cs.CL",
+ pdf_url="https://arxiv.org/pdf/2406.04744"
+)
+```
+
+This creates an `ArxivPaper` object with metadata about a specific paper, including its title, authors, summary, and PDF URL. The most important part of this object is the `pdf_url` field, which contains the location of the PDF file.
+
+## PDF Processing
+
+We implement functions to load and process PDFs:
+
+```python
+def load_pdf(arxiv_result):
+ pdf_url = arxiv_result["pdf_url"]
+ response = requests.get(pdf_url)
+ pdf_file = io.BytesIO(response.content)
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
+ return pdf_reader
+```
+
+This function downloads the PDF from the paper's `pdf_url` and loads it into a PyPDF2 reader, which makes the rest of the PDF processing easier and more efficient.
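+
+Once a PDF is loaded, the page text can be pulled out with PyPDF2's `extract_text()`. A minimal sketch (the helper name `extract_pdf_text` is illustrative, not part of the notebook):
+
+```python
+def extract_pdf_text(pdf_reader):
+    # Collect the text of every page, keeping page boundaries visible
+    pages = []
+    for page_num, page in enumerate(pdf_reader.pages):
+        pages.append(f"[Page {page_num + 1}]\n{page.extract_text()}")
+    return "\n\n".join(pages)
+```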
+
+## Converting PDF Images to Text
+
+One of the key challenges in processing academic papers is handling the visual content, which often includes both raster images and vector graphics. These visuals can contain crucial information that needs to be incorporated into our summarization process. To address this, we leverage Claude 3.5 Sonnet's advanced vision capabilities to convert these images into detailed textual descriptions.
+
+Here's the implementation of our main image processing function:
+
+
+
+```python
+import base64
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image
+
+@weave.op()
+def extract_images(paper, model="claude-3-5-sonnet-20240620"):
+ pdf_reader = load_pdf(paper)
+ all_images = []
+
+ for page_num, page in enumerate(pdf_reader.pages):
+ images = []
+
+ # Process raster images
+ for image in page.images:
+ img_data = image.data
+ kind = filetype.guess(img_data)
+ if kind is None:
+ print(f"Cannot guess file type for image on page {page_num + 1}")
+ continue
+
+ img_str = base64.b64encode(img_data).decode("utf-8")
+ data_url = f"data:{kind.mime};base64,{img_str}"
+ try:
+ images.append(
+ {"image": data_url, "description": process_figure_image(data_url, model=model)}
+ )
+ except Exception as e:
+ print(f"Error processing image on page {page_num + 1}: {e}")
+ images.append({"image": data_url, "description": ""})
+
+ # Process vector graphics
+ vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
+ if vector_graphics_image_data_url:
+ images.append({
+ "image": vector_graphics_image_data_url,
+ "description": process_vector_image_pdf(vector_graphics_image_data_url, model=model)
+ })
+
+ all_images.append(images)
+
+ return all_images
+```
+
+Let's break down the key components and challenges:
+
+### 1. Handling Raster Images
+
+Raster images are typically stored as embedded objects within the PDF. We extract these using PyPDF2's built-in functionality:
+
+```python
+for image in page.images:
+ img_data = image.data
+ # ... process the image data
+```
+
+The challenge here is that these images can be in various formats (PNG, JPEG, etc.). We use the `filetype` library to guess the MIME type, which is crucial for creating a valid data URL:
+
+```python
+kind = filetype.guess(img_data)
+if kind is None:
+ print(f"Cannot guess file type for image on page {page_num + 1}")
+ continue
+
+img_str = base64.b64encode(img_data).decode("utf-8")
+data_url = f"data:{kind.mime};base64,{img_str}"
+```
+
+### 2. Handling Vector Graphics
+
+Vector graphics present a unique challenge because they're not stored as traditional image files within the PDF. Instead, they're often part of the page's content stream. To handle these, we need to convert the entire page to an image:
+
+```python
+vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
+if vector_graphics_image_data_url:
+ images.append({
+ "image": vector_graphics_image_data_url,
+ "description": process_vector_image_pdf(vector_graphics_image_data_url, model=model)
+ })
+```
+
+The `convert_vector_graphic_page_to_image` function (shown below) uses `pdf2image` to convert the PDF page to a PNG image. This ensures we capture all vector graphics, but it also means we might capture text and other elements on the page.
+
+
+
+```python
+def convert_vector_graphic_page_to_image(pdf_page, scale_factor=0.5):
+ # Helper function to handle indirect PDF objects
+ def get_object(obj):
+ if isinstance(obj, PyPDF2.generic.IndirectObject):
+ return obj.get_object()
+ return obj
+
+ # Extract resources from the PDF page
+ resources = get_object(pdf_page.get('/Resources', {}))
+ xobject = get_object(resources.get('/XObject', {}))
+
+ # Check if there's a figure that's not a raster image (i.e., a vector graphic)
+ if xobject:
+ for obj in xobject.values():
+ obj = get_object(obj)
+ # Check if the object is a Form XObject, which typically represents vector graphics
+ if isinstance(obj, dict) and obj.get('/Subtype') == '/Form':
+ # Convert the page to a temporary PDF file in memory
+ pdf_bytes = io.BytesIO()
+ pdf_writer = PyPDF2.PdfWriter()
+ pdf_writer.add_page(pdf_page)
+ pdf_writer.write(pdf_bytes)
+ pdf_bytes.seek(0)
+
+ # Use pdf2image to convert the PDF to a PNG image
+ images = convert_from_bytes(pdf_bytes.getvalue(), fmt='png')
+
+ if images:
+ image = images[0]
+ # Resize the image to reduce memory usage and processing time
+ new_size = (int(image.width * scale_factor), int(image.height * scale_factor))
+ image = image.resize(new_size, Image.LANCZOS)
+
+ # Convert the image to a base64-encoded string
+ img_byte_arr = io.BytesIO()
+ image.save(img_byte_arr, format='PNG')
+ img_byte_arr = img_byte_arr.getvalue()
+ img_str = base64.b64encode(img_byte_arr).decode("utf-8")
+
+ # Return the image as a data URL
+ return f"data:image/png;base64,{img_str}"
+
+ # Return None if no vector graphics were found or conversion was not needed
+ return None
+```
+
+This approach ensures that all vector graphics on the page are captured, even if they can't be directly extracted as separate objects. However, it's important to note that this method will also capture all other content on the page, which may require additional processing or filtering in subsequent steps of the analysis pipeline.
+
+
+
+
+### 3. Using Claude 3.5 Sonnet for Image Description
+
+The core of our image processing lies in the `process_figure_image` and `process_vector_image_pdf` functions. These functions use Claude 3.5 Sonnet's vision capabilities to generate detailed descriptions of the images:
+
+```python
+@weave.op()
+def process_figure_image(data_url, model="claude-3-5-sonnet-20240620"):
+ img_str = data_url.split(",")[1]
+
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": img_str,
+ },
+ },
+ {
+ "type": "text",
+ "text": """Analyze this image as if it's a figure from a scientific research paper. Provide a detailed technical description addressing the following:
+
+1. Type of figure (e.g., graph, diagram, flowchart, experimental setup)
+2. Key components or variables represented
+3. Relationships or trends depicted
+4. Quantitative information (if present)
+5. Methodology or process illustrated (if applicable)
+6. Potential implications or conclusions that can be drawn
+7. Any limitations or assumptions evident in the figure
+
+Focus on technical accuracy and relevance to scientific research. Avoid general descriptions and concentrate on the specific scientific content presented.""",
+ },
+ ],
+ }
+ ],
+ )
+ return response.content[0].text
+
+@weave.op()
+def process_vector_image_pdf(data_url, model="claude-3-5-sonnet-20240620"):
+ img_str = data_url.split(",")[1]
+
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": img_str,
+ },
+ },
+ {
+ "type": "text",
+ "text": """This image is a full page from a scientific paper PDF, converted to PNG format. It may contain one or more vector graphic figures or charts. Your task is to:
+
+1. Identify and focus solely on the vector graphic figures or charts within the page.
+2. For each identified figure or chart, provide a detailed technical analysis addressing:
+
+ a. Type of figure (e.g., graph, diagram, flowchart)
+ b. Key components or variables represented
+ c. Relationships or trends depicted
+ d. Quantitative information (if present)
+ e. Methodology or process illustrated (if applicable)
+ f. Potential implications or conclusions that can be drawn
+
+3. Ignore any text or other elements on the page that are not part of the vector graphic figures.
+4. If multiple figures are present, analyze each separately and clearly indicate which figure you are describing.
+
+Focus on providing accurate, technical descriptions of the vector graphic content only.""",
+ },
+ ],
+ }
+ ],
+ )
+ return response.content[0].text
+```
+
+> The prompts for `process_figure_image` and `process_vector_image_pdf` are tailored to handle different scenarios:
+>
+> 1. **Figure Image Prompt:**
+> - Assumes a single, isolated figure
+> - Focuses on detailed analysis of the specific figure
+> - Includes points about limitations and assumptions
+>
+> 2. **Vector Image PDF Prompt:**
+> - Assumes a full page that may contain multiple vector graphics
+> - Instructs to identify and focus only on vector graphic elements
+> - Asks for separate analysis of each figure if multiple are present
+> - Explicitly tells to ignore text and non-vector graphic elements
+>
+> These differences ensure that Claude 3.5 Sonnet can accurately process and describe both individual figures and complex pages with multiple vector graphics.
+
+This approach allows us to handle the nuances of different image types within scientific papers. The figure image prompt is designed for standalone images, while the vector image prompt is tailored for full pages that may contain multiple graphics alongside text and other elements.
+
+### 4. Integrating Image Descriptions into the Text
+
+Finally, we integrate the image descriptions into the text of the paper:
+
+
+
+```python
+@weave.op()
+def replace_images_with_descriptions(paper, images):
+ # ... (previous code)
+ if images[page_num] and len(images[page_num]) > 0:
+ text += f"\n\n[Image Descriptions for page {page_num+1}]\n"
+ for image_num, image in enumerate(images[page_num]):
+ text += f"\n[Image {image_num+1}]: {image['description']}\n"
+ text += "[END OF IMAGE DESCRIPTIONS]\n"
+ # ... (rest of the function)
+```
+
+This approach ensures that the image descriptions are clearly demarcated within the text, making it easier for our summarization pipeline to incorporate this visual information.
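+
+For reference, here is a fuller sketch of how this function might look end to end, assuming page text is extracted with PyPDF2's `extract_text()` (the elided parts above are reconstructed here and may differ from the notebook's implementation):
+
+```python
+@weave.op()
+def replace_images_with_descriptions(paper, images):
+    pdf_reader = load_pdf(paper)
+    text = ""
+    for page_num, page in enumerate(pdf_reader.pages):
+        # Page text first
+        text += page.extract_text() + "\n"
+        # Then the demarcated image descriptions for this page, if any
+        if images[page_num] and len(images[page_num]) > 0:
+            text += f"\n\n[Image Descriptions for page {page_num+1}]\n"
+            for image_num, image in enumerate(images[page_num]):
+                text += f"\n[Image {image_num+1}]: {image['description']}\n"
+            text += "[END OF IMAGE DESCRIPTIONS]\n"
+    return text
+```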
+
+By implementing this comprehensive image processing pipeline, we ensure that our Chain of Density summarization process can incorporate crucial information from both textual and visual elements of academic papers. This is particularly important for fields where figures and diagrams play a significant role in conveying research findings.
+
+## Chain of Density Summarization
+
+The core of our summarization pipeline is then implemented in the following functions:
+
+
+
+### `summarize_current_summary`:
+ - Forms the foundation of our Chain of Density implementation
+ - Utilizes a carefully crafted prompt to guide the language model
+ - Instructs the model to identify new technical entities
+ - Incorporates new entities into the summary
+ - Increases overall information density while maintaining relevance to the given instruction
+
+```python
+@weave.op()
+def summarize_current_summary(document, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
+ # Define the maximum number of tokens for the model's response
+ max_tokens = 4096
+
+ # Construct the prompt for the LLM
+ prompt = f"""
+ Document:
+ {document}
+
+ Current summary:
+ {current_summary}
+
+ Instruction to focus on: {instruction}
+
+ Iteration: {iteration}
+
+ Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction using the below approach:
+
+ 1. Carefully read the current summary and the instruction.
+
+ 2. Identify 1-3 new, important technical entities or ideas from the original text that:
+ - Are directly relevant to the instruction
+ - Are not yet present in the current summary
+ - Add significant, specific information to the summary
+ - Are preferably 5 words or fewer
+ - May include methodologies, algorithms, metrics, or key findings
+    - Include these entities in the output before the summary
+
+ 3. Write a new summary that:
+ - Incorporates the newly identified entities/ideas
+ - Retains all crucial information from the current summary
+ - Increases overall information density
+ - Remains focused on addressing the instruction
+ - Utilizes the response window of {max_tokens} tokens
+
+ Guidelines:
+ - Prioritize technical accuracy and specificity over general readability
+ - Use precise terminology, domain-specific jargon, and include quantitative details where relevant
+ - Ensure all information is directly related to the instruction
+ - Make every word count: rewrite to improve density and make space for new technical entities
+ - Employ fusion, compression, and removal of less informative phrases to increase density
+ - Never drop entities or technical details from the current summary that are relevant to the instruction
+ - Maintain coherence while maximizing information density
+
+ Your goal is to create a summary that is noticeably denser, more technical, and more informative than the previous one, utilizing the response window of {max_tokens} tokens while staying laser-focused on the instruction. The summary should be suitable for an expert audience in the field."""
+
+ # Make the API call to the LLM
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=max_tokens,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ # Return the generated summary
+ return response.content[0].text
+```
+
+### `iterative_density_summarization`:
+ - Orchestrates the iterative refinement process
+ - Repeatedly calls `summarize_current_summary`
+ - Uses each iteration's output as input for the next
+ - Allows for gradual accumulation of technical details
+ - Increases density of information progressively
+
+```python
+@weave.op()
+def iterative_density_summarization(document, instruction, current_summary, density_iterations, model):
+ # Initialize a list to store summaries from each iteration
+ iteration_summaries = []
+
+ # Iterate through the specified number of density iterations
+ for iteration in range(1, density_iterations + 1):
+ # Generate a new summary based on the current summary and document
+ current_summary = summarize_current_summary(document, instruction, current_summary, iteration, model)
+
+ # Add the new summary to the list of iteration summaries
+ iteration_summaries.append(current_summary)
+
+ # Print the current iteration and summary for monitoring
+ print(f"Iteration {iteration}:\n{current_summary}\n")
+
+ # Return the final summary and the list of all iteration summaries
+ return current_summary, iteration_summaries
+```
+
+### `final_summary`:
+ - Performs a final condensation step after the iterative process
+ - Aims to reduce summary length by 30-40%
+ - Retains all critical technical content
+ - Optimizes for maximum information density and relevance to the instruction
+
+```python
+@weave.op()
+def final_summary(instruction, current_summary, model):
+ # Construct the prompt for the final summary generation
+ prompt = f"""Given this summary:
+
+{current_summary}
+
+And this instruction to focus on:
+
+{instruction}
+
+Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction. Follow these guidelines:
+
+1. Aim to reduce length by 30-40% while retaining all critical technical content relevant to the instruction.
+2. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
+3. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
+4. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
+5. Ensure that all key entities and concepts from the original summary that relate to the instruction are represented.
+6. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
+7. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints.
+8. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.
+
+The final summary should be a highly concentrated, technical distillation of the research that specifically addresses the given instruction, suitable for specialists in the field."""
+
+ # Make the API call to the LLM for the final summary
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ # Return the generated final summary
+ return response.content[0].text
+```
+
+### `chain_of_density_summarization`:
+ - Serves as the main entry point for the summarization process
+ - Coordinates the entire summarization pipeline
+ - Initiates the iterative summarization
+ - Applies the final condensation
+ - Returns a comprehensive result set including:
+ - Final summary
+ - Accumulated summary
+ - All intermediate summaries
+
+```python
+@weave.op()
+def chain_of_density_summarization(document, instruction, current_summary="", model="claude-3-5-sonnet-20240620", density_iterations=2):
+ # Perform iterative density summarization
+ current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)
+
+ # Generate the final, highly condensed summary
+ final_summary_text = final_summary(instruction, current_summary, model)
+
+ # Print the final summary for monitoring
+ print(f"Final Summary:\n{final_summary_text}\n")
+
+ # Return a dictionary containing all generated summaries
+ return {
+ "final_summary": final_summary_text,
+ "accumulated_summary": current_summary,
+ "iteration_summaries": iteration_summaries,
+ }
+```
+
+This implementation leverages the Chain of Density technique to produce increasingly dense and informative summaries. By iteratively refining the summary and focusing on technical entities and ideas, it generates concise yet highly informative summaries tailored to specific instructions. The process prioritizes technical accuracy, domain-specific terminology, and quantitative details, making it particularly suitable for summarizing complex scientific documents for expert audiences.
+
+## Weave Model Object
+
+We create a Weave Model object to encapsulate our summarization pipeline:
+
+
+
+```python
+class ArxivChainOfDensityPipeline(weave.Model):
+ model: str = "claude-3-5-sonnet-20240620"
+ density_iterations: int = 3
+
+ def __init__(self, model: str = "claude-3-5-sonnet-20240620", density_iterations: int = 3):
+ super().__init__()
+ self.model = model
+ self.density_iterations = density_iterations
+
+ @weave.op()
+ def predict(self, paper: ArxivPaper, instruction: str) -> dict:
+ extracted_images = extract_images(paper)
+ cleaned_text = replace_images_with_descriptions(paper, extracted_images)
+ result = chain_of_density_summarization(cleaned_text, instruction, model=self.model, density_iterations=self.density_iterations)
+ return result
+```
+
+This class encapsulates our summarization pipeline as a Weave Model. By inheriting from `weave.Model` and using the `@weave.op()` decorator, we enable automatic versioning and tracking of inputs, outputs, and code changes. This makes it easy to reproduce experiments and compare results across different model versions or parameter settings.
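+
+Running the pipeline on the sample paper from earlier might look like this (a usage sketch; the instruction string is illustrative):
+
+```python
+pipeline = ArxivChainOfDensityPipeline(model="claude-3-5-sonnet-20240620", density_iterations=3)
+result = pipeline.predict(
+    paper=arxiv_paper,
+    instruction="Summarize the key methodologies and novel contributions of this research.",
+)
+print(result["final_summary"])
+```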
+
+## Evaluation Dataset
+
+We create an evaluation dataset using sample Arxiv papers and instructions:
+
+
+
+```python
+eval_papers = [arxiv_paper]
+eval_instructions = [
+ "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.",
+]
+
+eval_data = list(product(eval_papers, eval_instructions))
+dataset = weave.Dataset(name="we-paper-reading-eval-data", rows=[{"paper": arxiv_paper, "instruction": instruction, "summary": arxiv_paper.summary} for arxiv_paper, instruction in eval_data])
+weave.publish(dataset)
+```
+
+This creates a Weave Dataset object that combines papers, instructions, and original summaries for evaluation. The `weave.Dataset` class allows us to version and track our evaluation data, ensuring reproducibility of our experiments. By publishing the dataset with `weave.publish()`, we make it available for future use and comparison.
+
+## Evaluation Metrics
+
+We implement several evaluation metrics to assess the quality of our summaries:
+
+
+
+```python
+@weave.op()
+def score_summary(summary, summary_type, instruction, model):
+ openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+ # Construct a detailed prompt for the GPT model to evaluate the summary
+ prompt = f"""Evaluate the quality of the following {summary_type} based on how well it addresses the given instruction. Use the scoring rules below to calculate three numerical scores between 0 and 10.
+
+Instruction: {instruction}
+
+{summary_type}:
+{summary}
+
+Scoring Rules:
+1. Relevance (0-10): [Detailed scoring criteria for relevance]
+2. Technical Quality (0-10): [Detailed scoring criteria for technical quality]
+3. Conciseness (0-10): [Detailed scoring criteria for conciseness]
+
+Provide your evaluation in the following JSON format:
+{{
+ "relevance": {{
+ "score":
+ }},
+ "technical_quality": {{
+ "score":
+ }},
+ "conciseness": {{
+ "score":
+ }}
+}}
+
+Ensure your response is ONLY valid JSON. Do not include any other text outside the JSON object.
+Ensure you have the keys: relevance, technical_quality, conciseness, each containing only a score.
+Ensure each score is a float between 0 and 10, using the scoring rules provided above.
+"""
+
+ # Make an API call to the GPT model for evaluation
+ response = openai_client.chat.completions.create(
+ model=model,
+ messages=[{"role": "user", "content": prompt}],
+ response_format={"type": "json_object"}
+ )
+
+ # Parse and return the JSON response
+ return json.loads(response.choices[0].message.content)
+```
+
+This function uses an OpenAI model (GPT-4o by default) to evaluate individual summaries based on three criteria:
+- Relevance
+- Technical quality
+- Conciseness
+
+Benefits:
+- Captures nuanced aspects of summary quality
+- Provides a holistic assessment of how well the summary addresses the given instruction
+- Evaluates technical accuracy while considering conciseness
+
+---
+
+```python
+@weave.op()
+def calculate_long_tail_stats(scores):
+ if not scores:
+ return None
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ stats = {}
+
+ for aspect in aspects:
+ try:
+ # Handle different input formats (list of lists or list of dicts)
+ if isinstance(scores[0], list):
+ flattened_scores = [score[aspect]['score'] for sublist in scores for score in sublist]
+ elif isinstance(scores[0], dict):
+ flattened_scores = [score[aspect]['score'] for score in scores]
+ else:
+ print(f"Unexpected format for scores: {scores}")
+ return None
+
+ # Calculate statistics for each aspect
+ stats[aspect] = {
+ "mean": np.mean(flattened_scores),
+ "tail_ratio": np.mean(sorted(flattened_scores)[-max(1, int(len(flattened_scores)*0.05)):]) / np.mean(flattened_scores),
+ }
+ except Exception as e:
+ print(f"Error calculating stats for {aspect}: {str(e)}")
+ stats[aspect] = None
+
+ return stats
+```
+
+This function:
+- Analyzes the distribution of scores across multiple summaries
+- Calculates for each aspect (relevance, technical quality, conciseness):
+ - Mean score
+ - "Tail ratio" (average of top 5% scores compared to overall mean)
+
+Usefulness:
+- Helps identify potential outliers or exceptionally high-quality summaries
+- Provides insight into overall performance of the summarization process
+- Highlights areas where the model excels or needs improvement
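+
+For instance, feeding it a list of per-iteration score dictionaries (dummy values shown) returns per-aspect statistics:
+
+```python
+scores = [
+    {"relevance": {"score": 6.0}, "technical_quality": {"score": 5.5}, "conciseness": {"score": 7.0}},
+    {"relevance": {"score": 7.5}, "technical_quality": {"score": 6.0}, "conciseness": {"score": 7.5}},
+    {"relevance": {"score": 8.0}, "technical_quality": {"score": 7.0}, "conciseness": {"score": 7.0}},
+]
+stats = calculate_long_tail_stats(scores)
+# stats["relevance"] -> approximately {"mean": 7.17, "tail_ratio": 1.12}
+```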
+
+---
+
+```python
+@weave.op()
+def analyze_iteration_impact(scores):
+ if len(scores) < 2:
+ return {aspect: {"diminishing_returns_point": 0, "cumulative_improvement": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ results[aspect] = {
+ "diminishing_returns_point": next((i for i, imp in enumerate(improvements) if imp <= 0), len(improvements)),
+ "cumulative_improvement": sum(improvements),
+ }
+
+ return results
+```
+
+
+This function:
+- Assesses the improvement of summaries across iterations
+- Key metrics:
+ - Point of diminishing returns (where improvements become negative or zero)
+ - Cumulative improvement for each aspect
+
+Value:
+- Helps optimize the number of iterations in the Chain of Density process
+- Determines when further iterations may no longer yield significant improvements
+
+---
+
+```python
+@weave.op()
+def find_optimal_improvement_range(scores):
+ if len(scores) < 3:
+ return {aspect: {"optimal_range_start": 0, "optimal_range_end": 0, "score_at_start": 0, "score_at_end": 0, "improvement_in_range": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ # Calculate moving average of improvements
+ window_size = min(3, len(aspect_scores) - 1)
+ moving_avg = np.convolve(improvements, np.ones(window_size), 'valid') / window_size
+
+ # Find range where improvements are above a threshold
+ threshold = 0.1 * np.mean(improvements)
+ above_threshold = [i for i, avg in enumerate(moving_avg) if avg >= threshold]
+
+ if not above_threshold:
+ optimal_start, optimal_end = 0, 0
+ else:
+ optimal_start = above_threshold[0]
+ optimal_end = above_threshold[-1] + 1
+
+ results[aspect] = {
+ "optimal_range_start": optimal_start,
+ "optimal_range_end": optimal_end,
+ "score_at_start": aspect_scores[optimal_start],
+ "score_at_end": aspect_scores[optimal_end] if optimal_end < len(aspect_scores) else aspect_scores[-1],
+ "improvement_in_range": sum(improvements[optimal_start:optimal_end])
+ }
+
+ return results
+```
+
+
+This function:
+- Determines the most effective range of iterations for improvement
+- Methodology:
+ - Uses moving average of improvements to identify sustained progress
+ - Finds optimal range where improvements are above a certain threshold
+
+Benefits:
+- Aids in fine-tuning the Chain of Density process
+- Identifies the most productive iteration range for each aspect of summary quality
+
+---
+
+```python
+@weave.op()
+def find_optimal_score_range(scores):
+ if len(scores) < 2:
+ return {aspect: {"optimal_range_start": 0, "optimal_range_end": 0, "highest_score": 0, "improvement_in_range": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ highest_score = max(aspect_scores)
+ highest_score_index = aspect_scores.index(highest_score)
+
+ # Find the best range leading up to the highest score
+ best_start = 0
+ best_end = highest_score_index
+ best_improvement = sum(improvements[:highest_score_index])
+
+ for start in range(highest_score_index):
+ current_improvement = sum(improvements[start:highest_score_index])
+ if current_improvement > best_improvement:
+ best_start = start
+ best_improvement = current_improvement
+
+ results[aspect] = {
+ "optimal_range_start": best_start,
+ "optimal_range_end": highest_score_index,
+ "score_at_start": aspect_scores[best_start],
+ "score_at_end": highest_score,
+ "improvement_in_range": best_improvement
+ }
+
+ return results
+```
+
+
+This function:
+- Identifies the iteration range producing the highest quality summaries
+- Process:
+ - Finds the range leading up to the highest score for each aspect
+ - Considers cumulative improvement within the range
+
+Usefulness:
+- Helps understand which iterations contribute most significantly to final summary quality
+- Assists in optimizing the summarization process for maximum effectiveness
+
+---
+
+```python
+@weave.op()
+def process_iteration_summaries(model_output, instruction, model):
+ iteration_scores = [score_summary(summary, f"Iteration Summary {i+1}", instruction, model)
+ for i, summary in enumerate(model_output["iteration_summaries"])]
+ return {
+ "long_tail_stats": calculate_long_tail_stats(iteration_scores),
+ # Additional analyses can be added here if needed
+ }
+```
+
+
+This function:
+- Aggregates and analyzes scores across all summarization iterations
+- Provides:
+ - Holistic view of summary quality evolution throughout Chain of Density iterations
+ - Comprehensive analysis of the iterative summarization approach
+
+Value:
+- Helps understand overall effectiveness of the iterative process
+- Identifies trends in quality improvement across iterations
+
+---
+
+```python
+@weave.op()
+def quality_scorer(instruction, model_output, model="gpt-4o"):
+ scores = {
+ "iteration_summaries_analysis": {},
+ "accumulated_summary": {},
+ "final_summary": {}
+ }
+
+ try:
+ # Process iteration summaries
+ scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)
+
+ # Score accumulated summary
+ scores["accumulated_summary"] = score_summary(model_output["accumulated_summary"], "Accumulated Summary", instruction, model)
+
+ # Score final summary
+ scores["final_summary"] = score_summary(model_output["final_summary"], "Final Summary", instruction, model)
+
+ # Flatten the scores dictionary for easier analysis
+ flattened_scores = {}
+ for key, value in scores.items():
+ if isinstance(value, dict):
+ flattened_scores[key] = flatten_dict(value)
+ else:
+ flattened_scores[key] = value
+
+ scores = flatten_dict(flattened_scores)
+
+ except Exception as e:
+ print(f"Error in quality_scorer: {str(e)}")
+ scores["error"] = str(e)
+
+ return scores
+```
+
+
+This function:
+- Serves as the main entry point for evaluating summarization quality
+- Features:
+ - Combines all previous metrics into a comprehensive evaluation
+ - Analyzes iteration summaries, accumulated summary, and final summary
+
+Benefits:
+- Provides a detailed, multi-faceted assessment of the summarization pipeline's performance
+- Offers insights into various aspects of summary quality
+- Evaluates the effectiveness of the Chain of Density process as a whole
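+
+Note that `quality_scorer` relies on a small `flatten_dict` helper that is not shown above. A minimal sketch of what it might look like (an assumption, not necessarily the notebook's implementation):
+
+```python
+def flatten_dict(d, parent_key="", sep="."):
+    # Flatten nested dictionaries into a single level, joining keys with `sep`
+    items = {}
+    for key, value in d.items():
+        new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
+        if isinstance(value, dict):
+            items.update(flatten_dict(value, new_key, sep=sep))
+        else:
+            items[new_key] = value
+    return items
+```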
+
+---
+
+These evaluation metrics collectively provide a robust framework for assessing the quality and effectiveness of our Chain of Density summarization pipeline. By examining multiple aspects of summary quality across different stages of the process, we can gain valuable insights into the strengths and weaknesses of our approach, identify areas for improvement, and optimize the summarization process for maximum effectiveness.
+
+## Running the Evaluation
+
+Finally, we set up and run the evaluation:
+
+
+
+```python
+models = [
+ "claude-3-opus-20240229",
+ "claude-3-haiku-20240307",
+ "claude-3-5-sonnet-20240620"
+]
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
+for model in models:
+ arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)
+ await evaluation.evaluate(arxiv_chain_of_density_pipeline)
+```
+
+This code sets up a Weave Evaluation object and runs the evaluation for each model in our list.
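+
+Note that `evaluation.evaluate` is a coroutine, so the `await` above works directly in a notebook. If you run this as a plain Python script instead, you would need to drive it with `asyncio` yourself, roughly like this:
+
+```python
+import asyncio
+
+async def run_evaluations():
+    for model in models:
+        pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)
+        await evaluation.evaluate(pipeline)
+
+asyncio.run(run_evaluations())
+```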
+
+## Optional: Advanced Chunking Technique
+
+
+
+The cookbook also includes an optional section on an advanced chunking technique to handle longer documents more effectively:
+
+### Chunking
+
+1. `chunk_text`: Splits the input text into manageable chunks, handling special cases like image descriptions.
+
+```python
+@weave.op()
+def chunk_text(text, chunk_size):
+ chunks = []
+ current_chunk = ""
+ lines = text.split('\n')
+
+ i = 0
+ while i < len(lines):
+ line = lines[i]
+ # If adding this line would exceed the chunk size, start a new chunk
+ if len(current_chunk) + len(line) > chunk_size:
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+ current_chunk = ""
+
+        # Special handling for image descriptions: keep each block intact in its own chunk
+        if line.startswith("[Image Descriptions for page"):
+            # Flush any text accumulated before the image description block
+            if current_chunk.strip():
+                chunks.append(current_chunk.strip())
+                current_chunk = ""
+
+            # Collect all lines of the image description, including the closing marker
+            image_descriptions = line + "\n"
+            i += 1
+            while i < len(lines) and not lines[i].startswith("[END OF IMAGE DESCRIPTIONS]"):
+                image_descriptions += lines[i] + "\n"
+                i += 1
+            if i < len(lines):
+                image_descriptions += lines[i] + "\n"
+                i += 1
+
+            # Add the entire image description as a separate chunk
+            chunks.append(image_descriptions.strip())
+        else:
+            current_chunk += line + "\n"
+            i += 1
+
+ # Add any remaining text as the last chunk
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ # Combine smaller chunks to reach the desired chunk size
+ combined_chunks = []
+ current_combined_chunk = ""
+ for chunk in chunks:
+ if len(current_combined_chunk) + len(chunk) <= chunk_size:
+ current_combined_chunk += chunk + "\n\n"
+ else:
+ if current_combined_chunk:
+ combined_chunks.append(current_combined_chunk.strip())
+ current_combined_chunk = chunk + "\n\n"
+
+ if current_combined_chunk:
+ combined_chunks.append(current_combined_chunk.strip())
+
+ return combined_chunks
+```
+
+2. `summarize_chunk`: Summarizes an individual chunk, focusing on the given instruction and incorporating previous summary information.
+
+```python
+@weave.op()
+def summarize_chunk(chunk, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
+ # Construct a prompt for summarizing the chunk
+ prompt = f"""Current summary:
+ {current_summary}
+
+ New information:
+ {chunk}
+
+ Instruction to focus on: {instruction}
+
+ Iteration: {iteration}
+
+ Create an extremely dense, highly technical summary that specifically addresses the given instruction. Follow these steps:
+
+ 1. Identify 3-5 key technical points from the new information that are directly relevant to the instruction, prioritizing:
+ - Novel methodologies or algorithms related to the instruction
+ - Specific quantitative results or metrics that address the instruction
+ - Detailed experimental setups or parameters pertinent to the instruction
+ - Precise definitions of domain-specific concepts mentioned in the instruction
+ - Critical limitations or assumptions in the research that affect the instruction
+
+    2. Integrate these points with the current summary, ensuring:
+ - Direct relevance to the instruction at hand
+ - No redundancy or oversimplification
+ - Preservation of technical nuances and complexities specific to the instruction
+ - Inclusion of relevant equations, formulas, or mathematical notations that help address the instruction
+ - Accurate representation of statistical significance and error margins for instruction-related data
+
+    3. Rephrase the combined information to maximize information density while maintaining focus on the instruction:
+ - Use domain-specific terminology and jargon without simplification, as relevant to the instruction
+ - Maintain the level of detail expected in a PhD-level discourse on the specific topic of the instruction
+ - Incorporate precise citations or references where applicable to support the response
+ - Preserve any conflicting viewpoints or ongoing debates in the field that relate to the instruction
+
+    4. With each iteration, aim to increase information density by 30-40% without sacrificing technical accuracy or critical details that address the instruction.
+
+    5. Ensure the summary includes instruction-specific:
+ - Methodological details (e.g., exact algorithms, parameter settings) that are crucial to addressing the instruction
+ - Precise quantitative results with appropriate units and error bounds that directly relate to the instruction
+ - Detailed descriptions of novel techniques or approaches that are key to addressing the instruction
+ - Critical analysis of strengths and limitations in the research as they pertain to the instruction
+
+ Produce a summary that is significantly more information-dense and technically precise than the previous one, while remaining laser-focused on addressing the given instruction. Use language appropriate for a highly specialized audience in the field."""
+
+ # Use the Anthropic API to generate the summary
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}]
+ )
+ return response.content[0].text
+```
+
+3. `summarize_chunk_summaries`: Combines summaries from multiple chunks into a coherent whole.
+
+```python
+@weave.op()
+def summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model="claude-3-opus-20240229"):
+ # Construct a prompt for combining chunk summaries
+ prompt = f"""Given this current summary:
+
+ {current_summary}
+
+ And these chunk summaries:
+
+ {' '.join(chunk_summaries)}
+
+ And this instruction to focus on:
+
+ {instruction}
+
+ Create an extremely dense, final summary that refines the current summary by incorporating key information from the chunk summaries, while specifically addressing the given instruction. Follow these guidelines:
+
+ 1. Integrate the most relevant and important information from the chunk summaries into the current summary.
+ 2. Ensure all key technical content from both the current summary and chunk summaries that relates to the instruction is retained.
+ 3. Aim to reduce overall length by 30-40% while increasing information density.
+ 4. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
+ 5. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
+ 6. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
+ 7. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
+ 8. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints from across all summaries.
+ 9. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.
+
+ The final summary should be a highly concentrated, technical distillation of all provided summaries that specifically addresses the given instruction, suitable for specialists in the field."""
+
+ # Use the Anthropic API to generate the combined summary
+ return anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}],
+ ).content[0].text
+```
+
+4. `summarize_chunk_iteration`: Manages the process of summarizing all chunks in a single iteration.
+
+```python
+@weave.op()
+def summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model):
+ chunk_summaries = []
+ # Summarize each chunk individually
+ for i, chunk in enumerate(chunks, 1):
+ current_summary = summarize_chunk(chunk, instruction, current_summary, iteration, model)
+ chunk_summaries.append(current_summary)
+ print(f"Iteration {iteration}, Chunk {i}:\n{current_summary}\n")
+ # Combine all chunk summaries into a single summary
+ current_summary = summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model)
+ print(f"Iteration {iteration}, Final Summary:\n{current_summary}\n")
+ return current_summary, chunk_summaries
+```
+
+5. `iterative_chunk_summarization`: Performs multiple iterations of chunk-based summarization.
+
+```python
+@weave.op()
+def iterative_chunk_summarization(chunks, instruction, current_summary, chunk_iterations, model):
+ chunk_iteration_summaries = []
+ chunk_summaries = []
+ # Perform multiple iterations of chunk summarization
+ for iteration in range(1, chunk_iterations + 1):
+ current_summary, iteration_chunk_summaries = summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model)
+ chunk_iteration_summaries.append(current_summary)
+ chunk_summaries.append(iteration_chunk_summaries)
+ return current_summary, chunk_iteration_summaries, chunk_summaries
+```
+
+6. `chain_of_density_summarization`: Orchestrates the entire summarization process, including both chunk-based and density-based summarization steps.
+
+
+
+```python
+@weave.op()
+def chain_of_density_summarization(document, instruction, model="claude-3-5-sonnet-20240620", chunk_size=8192, chunk_iterations=2, density_iterations=2):
+ # Split the text into chunks
+    chunks = chunk_text(document, chunk_size)
+ print(f"Number of chunks: {len(chunks)}")
+ print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
+
+ # Perform chunk-based summarization
+ current_summary, chunk_iteration_summaries, chunk_summaries = iterative_chunk_summarization(chunks, instruction, "", chunk_iterations, model)
+
+ # Perform final density-based summarization
+    current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)
+ final_summary_text = final_summary(instruction, current_summary, model)
+ print(f"Final Summary:\n{final_summary_text}\n")
+
+ # Return all intermediate and final results
+ return {
+ "final_summary": final_summary_text,
+ "accumulated_summary": current_summary,
+ "iteration_summaries": iteration_summaries,
+ "chunk_iteration_summaries": chunk_iteration_summaries,
+ "chunk_summaries": chunk_summaries
+ }
+```
+
+This advanced chunking technique allows for more effective handling of longer documents, potentially improving the quality and comprehensiveness of the final summary.
+
+### Model Evaluation
+
+> Note that the `ArxivChainOfDensityPipeline` class stays identical; only the underlying `chain_of_density_summarization` function is replaced by the chunk-aware version above.
+
+## Advanced Evaluation Metrics
+
+To thoroughly assess the quality and effectiveness of our Chain of Density summarization pipeline, we implement a set of advanced evaluation metrics. These metrics provide a comprehensive analysis of the summarization process, taking into account both the chunk-based approach and the overall summary quality.
+
+### Processing Chunk Summaries
+
+The `process_chunk_summaries` function evaluates the quality of individual chunk summaries:
+
+```python
+def process_chunk_summaries(model_output, instruction, model):
+ scores = {}
+ for i, chunk_list in enumerate(model_output["chunk_summaries"]):
+ chunk_summary_scores = []
+ for j, summary in enumerate(chunk_list):
+ chunk_summary_score = score_summary(summary, f"Chunk Summary {i+1}.{j+1}", instruction, model)
+ chunk_summary_scores.append(chunk_summary_score)
+
+ scores[f"chunk_summaries_analysis_{i+1}"] = {
+ "long_tail_stats": calculate_long_tail_stats(chunk_summary_scores),
+ "iteration_impact": analyze_iteration_impact(chunk_summary_scores),
+ "optimal_improvement_range": find_optimal_improvement_range(chunk_summary_scores),
+ "optimal_score_range": find_optimal_score_range(chunk_summary_scores)
+ }
+ return scores
+```
+
+This function:
+- Scores each chunk summary individually
+- Calculates various statistics for each chunk iteration, including long-tail stats, iteration impact, and optimal improvement ranges
+
+### Processing Chunk Iteration Summaries
+
+The `process_chunk_iteration_summaries` function evaluates the quality of summaries produced after each chunk iteration:
+
+```python
+def process_chunk_iteration_summaries(model_output, instruction, model):
+ chunk_iteration_scores = [
+ score_summary(summary, f"Chunk Iteration Summary {i+1}", instruction, model)
+ for i, summary in enumerate(model_output["chunk_iteration_summaries"])
+ ]
+
+ return {
+ "long_tail_stats": calculate_long_tail_stats(chunk_iteration_scores),
+ "iteration_impact": analyze_iteration_impact(chunk_iteration_scores),
+ "optimal_improvement_range": find_optimal_improvement_range(chunk_iteration_scores),
+ "optimal_score_range": find_optimal_score_range(chunk_iteration_scores)
+ }
+```
+
+This function:
+- Scores each chunk iteration summary
+- Calculates aggregate statistics across all chunk iterations
+
+### Quality Scorer
+
+The `quality_scorer` function serves as the main entry point for our evaluation process:
+
+```python
+@weave.op()
+def quality_scorer(instruction, model_output, model="gpt-4o"):
+ scores = {
+ "chunk_summaries_analysis": {},
+ "chunk_iteration_summaries_analysis": {},
+ "iteration_summaries_analysis": {},
+ "accumulated_summary": {},
+ "final_summary": {}
+ }
+
+ try:
+ chunk_summaries_scores = process_chunk_summaries(model_output, instruction, model)
+ scores.update(chunk_summaries_scores)
+
+ scores["chunk_iteration_summaries_analysis"] = process_chunk_iteration_summaries(model_output, instruction, model)
+ scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)
+ scores["accumulated_summary"] = score_summary(model_output["accumulated_summary"], "Accumulated Summary", instruction, model)
+ scores["final_summary"] = score_summary(model_output["final_summary"], "Final Summary", instruction, model)
+
+ flattened_scores = {}
+ for key, value in scores.items():
+ if isinstance(value, dict):
+ flattened_scores[key] = flatten_dict(value)
+ else:
+ flattened_scores[key] = value
+
+ scores = flatten_dict(flattened_scores)
+
+ except Exception as e:
+ print(f"Error in quality_scorer: {str(e)}")
+ scores["error"] = str(e)
+
+ return scores
+```
+
+This function:
+- Orchestrates the entire evaluation process
+- Processes and scores chunk summaries, chunk iteration summaries, and the final summary
+- Flattens the nested score dictionary for easier analysis
+- Handles any errors that occur during the scoring process
+
+By implementing these advanced evaluation metrics, we can gain deep insights into the performance of our Chain of Density summarization pipeline at various stages of the process. This allows us to identify areas for improvement and optimize our approach for maximum effectiveness.
+
+
+
+## Conclusion
+
+This cookbook has demonstrated the implementation of an advanced AI-powered summarization bot using the Chain of Density technique. By leveraging Anthropic's Claude API, the Arxiv API, and Weave for experiment tracking, we've created a powerful tool for generating concise, information-dense summaries of scientific papers.
+
+Key takeaways:
+1. The Chain of Density technique allows for iterative refinement of summaries, increasing information density while maintaining relevance to specific instructions.
+2. Our implementation handles both textual content and visual elements (images and vector graphics) from PDF papers, ensuring comprehensive coverage of research content.
+3. The optional advanced chunking technique enables effective processing of longer documents, improving summary quality for extensive research papers.
+4. Robust evaluation metrics provide insights into the summarization process, allowing for continuous improvement and optimization.
+
+Potential applications:
+- Rapid literature review for researchers
+- Automated creation of paper abstracts or extended summaries
+- Assisting in the peer review process by providing concise overviews of submissions
+- Enhancing search and discovery of relevant research papers
+
+By combining state-of-the-art language models with carefully crafted prompts and evaluation techniques, this summarization pipeline demonstrates the potential for AI to significantly accelerate and enhance scientific research processes.
diff --git a/colabs/anthropic/summarization/arxiv_models.py b/colabs/anthropic/summarization/arxiv_models.py
new file mode 100644
index 00000000..6c850c8d
--- /dev/null
+++ b/colabs/anthropic/summarization/arxiv_models.py
@@ -0,0 +1,59 @@
+from pydantic import BaseModel
+from typing import List, Optional
+from datetime import datetime
+
+
+class Author(BaseModel):
+ full_name: str
+
+
+class Link(BaseModel):
+ href: str
+ title: Optional[str] = None
+ rel: Optional[str] = None
+ content_type: Optional[str] = None
+
+
+class ArxivPaper(BaseModel):
+ entry_id: str
+ updated: datetime
+ published: datetime
+ title: str
+ authors: List[Author]
+ summary: str
+ comment: Optional[str] = None
+ journal_ref: Optional[str] = None
+ doi: Optional[str] = None
+ primary_category: str
+ categories: List[str]
+ links: List[Link]
+ pdf_url: Optional[str] = None
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+
+def convert_raw_arxiv_to_pydantic(paper):
+ return ArxivPaper(
+ entry_id=paper.entry_id,
+ updated=paper.updated,
+ published=paper.published,
+ title=paper.title,
+ authors=[Author(full_name=str(author)) for author in paper.authors],
+ summary=paper.summary,
+ comment=paper.comment,
+ journal_ref=paper.journal_ref,
+ doi=paper.doi,
+ primary_category=paper.primary_category,
+ categories=paper.categories,
+ links=[
+ Link(
+ href=link.href,
+ title=link.title,
+ rel=link.rel,
+ content_type=link.content_type,
+ )
+ for link in paper.links
+ ],
+ pdf_url=paper.pdf_url,
+ )
diff --git a/colabs/anthropic/summarization/chain_of_density_arxiv.ipynb b/colabs/anthropic/summarization/chain_of_density_arxiv.ipynb
new file mode 100644
index 00000000..a4866871
--- /dev/null
+++ b/colabs/anthropic/summarization/chain_of_density_arxiv.ipynb
@@ -0,0 +1,2342 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AOZk1uhGdPd-"
+ },
+ "source": [
+ "[](https://colab.research.google.com/github/wandb/examples/blob/add-summarization-example/colabs/anthropic/summarization/chain_of_density_arxiv.ipynb)\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CUeQinM0dPeC"
+ },
+ "source": [
+ "# Arxiv PDF Summarization Bot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2N85PjO9dPeC"
+ },
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "_js2H0ojdPeC"
+ },
+ "outputs": [],
+ "source": [
+ "IN_COLAB = False\n",
+ "try:\n",
+ " from google.colab import userdata\n",
+ " import os\n",
+ " os.environ[\"WANDB_API_KEY\"] = userdata.get(\"WANDB_API_KEY\")\n",
+ " os.environ[\"OPENAI_API_KEY\"] = userdata.get(\"OPENAI_API_KEY\")\n",
+ " os.environ[\"ANTHROPIC_API_KEY\"] = userdata.get(\"ANTHROPIC_API_KEY\")\n",
+ " !apt-get install poppler-utils\n",
+ " IN_COLAB = True\n",
+ "except:\n",
+ " from dotenv import load_dotenv\n",
+ " load_dotenv()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rNvKQGA2dPeD",
+ "outputId": "2059badb-9248-4093-c171-325b4b084cbc"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Folder 'weave_cookbooks' already exists.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import subprocess\n",
+ "import shutil\n",
+ "\n",
+ "repo_url = \"https://github.com/wandb/examples.git\"\n",
+ "target_folder = \"examples\"\n",
+ "subdirectory = \"colabs/anthropic/summarization\"\n",
+ "branch = \"add-summarization-example\"\n",
+ "\n",
+ "if not os.path.exists(target_folder) and IN_COLAB:\n",
+ " print(f\"Cloning repository: {repo_url}\")\n",
+ "\n",
+ " # Clone the entire repository to a temporary folder\n",
+ " temp_folder = \"temp_weave_repo\"\n",
+ " subprocess.run([\"git\", \"clone\", \"--depth\", \"1\", \"--branch\", branch, repo_url, temp_folder], check=True)\n",
+ "\n",
+ " # Move the desired subdirectory to the target folder\n",
+ " shutil.move(os.path.join(temp_folder, subdirectory), target_folder)\n",
+ "\n",
+ " # Remove the temporary folder\n",
+ " shutil.rmtree(temp_folder)\n",
+ "\n",
+ " print(f\"Successfully cloned {subdirectory} from branch '{branch}' to {target_folder}\")\n",
+ " \n",
+ "else:\n",
+ " print(f\"Folder '{target_folder}' already exists.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if os.path.exists(target_folder) and IN_COLAB:\n",
+ " %cd weave_cookbooks/summarization\n",
+ " !pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "uxUL9gcGdPeD"
+ },
+ "outputs": [],
+ "source": [
+ "import base64\n",
+ "import json\n",
+ "import os\n",
+ "from datetime import datetime, timezone\n",
+ "from itertools import product\n",
+ "\n",
+ "import anthropic\n",
+ "import filetype\n",
+ "import numpy as np\n",
+ "import PyPDF2\n",
+ "import requests\n",
+ "import arxiv\n",
+ "from arxiv_models import ArxivPaper, Author, Link, convert_raw_arxiv_to_pydantic\n",
+ "from dotenv import load_dotenv\n",
+ "from openai import OpenAI\n",
+ "from pdf2image import convert_from_bytes\n",
+ "from PIL import Image\n",
+ "\n",
+ "import weave\n",
+ "import io"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iydmmNfodPeE",
+ "outputId": "8add2ae1-b816-4485-f018-5f9c84264a49"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Logged in as Weights & Biases user: a-sh0ts.\n",
+ "View Weave data at https://wandb.ai/a-sh0ts/arxiv-chain-of-density-summarization-llama-questions/weave\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weave.init(\"arxiv-chain-of-density-summarization\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "dOCyzHN_dPeE"
+ },
+ "outputs": [],
+ "source": [
+ "anthropic_client = anthropic.Anthropic(api_key=os.getenv(\"ANTHROPIC_API_KEY\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "y_VcCrQOdPeE"
+ },
+ "outputs": [],
+ "source": [
+ "def flatten_dict(d, parent_key='', sep='_'):\n",
+ " items = []\n",
+ " for k, v in d.items():\n",
+ " new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n",
+ " if isinstance(v, dict):\n",
+ " items.extend(flatten_dict(v, new_key, sep=sep).items())\n",
+ " else:\n",
+ " items.append((new_key, v))\n",
+ " return dict(items)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nkffmCe9dPeF"
+ },
+ "source": [
+ "## (Optional) Fetch Arxiv Papers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jmI98U89dPeF"
+ },
+ "source": [
+ "This section demonstrates how to fetch relevant papers from the ArXiv database based on a given research instruction. This step is optional but can be useful if you want to dynamically retrieve papers for summarization instead of using predefined examples.\n",
+ "\n",
+ "### Generate ArXiv Query Arguments\n",
+ "\n",
+ "We use the `generate_arxiv_query_args` function to create an optimal ArXiv search query and determine the appropriate number of results to fetch. This function leverages Claude to generate a well-crafted query string and suggest a suitable `max_results` value.\n",
+ "\n",
+ "```python\n",
+ "instruction = \"Answer the following question: What are the latest advancements in audio music information retrieval?\"\n",
+ "arxiv_query, max_results = generate_arxiv_query_args(instruction)\n",
+ "print(f\"ArXiv query: {arxiv_query}\")\n",
+ "print(f\"Max results: {max_results}\")\n",
+ "```\n",
+ "\n",
+ "### Fetch ArXiv Papers\n",
+ "\n",
+ "Once we have the query and max_results, we can use the `fetch_arxiv_papers` function to retrieve the relevant papers from ArXiv. This function returns a list of `ArxivPaper` objects, which contain metadata about each paper, including its title, authors, abstract, and PDF URL.\n",
+ "\n",
+ "```python\n",
+ "arxiv_papers = fetch_arxiv_papers(arxiv_query, max_results)\n",
+ "```\n",
+ "\n",
+ "By uncommenting and running these code snippets, you can dynamically fetch ArXiv papers based on your research interests. This allows for a more flexible and customizable summarization pipeline, enabling you to process and summarize the most recent and relevant research in your field of interest."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "ECq1y7iUdPeF"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def generate_arxiv_query_args(instruction, model=\"claude-3-sonnet-20240229\"):\n",
+ " tools = [{\n",
+ " \"name\": \"prepare_arxiv_search\",\n",
+ " \"description\": \"Prepare arguments for ArXiv paper search. This tool generates an optimal query string utilizing Boolean operators, field-specific syntax, and precise search terms. It also determines an efficient maximum number of results to fetch, balancing comprehensive coverage with processing efficiency. The output is tailored to the given research instruction, aiming to provide relevant and focused search results.\",\n",
+ " \"input_schema\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"query\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The ArXiv search query string. Supports Boolean operators (AND, OR, NOT), field-specific syntax (e.g., 'ti:' for title, 'au:' for author), quotation marks for exact phrases, and wildcards. Can include multiple search terms to refine results based on title, abstract, authors, comments, journal reference, subject category, or report number.\"\n",
+ " },\n",
+ " \"max_results\": {\n",
+ " \"type\": \"integer\",\n",
+ " \"description\": \"The maximum number of paper results to return from the ArXiv search. Aims to minimize the number of results while ensuring sufficient coverage of the topic. Defaults to 5 if not specified. Increasing this value broadens the search but may increase processing time and resource usage. Aim to be below 10 articles.\"\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"query\", \"max_results\"]\n",
+ " }\n",
+ " }]\n",
+ "\n",
+ " system_prompt = \"\"\"You are an expert at generating ArXiv queries. Use the prepare_arxiv_search tool to create an optimal query and determine the appropriate maximum number of results for the given research question. The query should utilize advanced search techniques including Boolean operators, field-specific syntax, and precise terms to ensure comprehensive yet focused results.\"\"\"\n",
+ "\n",
+ " messages = [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": f\"Use the prepare_arxiv_search tool to generate an optimal ArXiv query and determine the maximum number of results for the following research instruction: {instruction}\"\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=messages,\n",
+ " system=system_prompt,\n",
+ " tools=tools\n",
+ " )\n",
+ "\n",
+ " # Extract the query and max_results from the response\n",
+ " for content in response.content:\n",
+ " if content.type == 'tool_use' and content.name == 'prepare_arxiv_search':\n",
+ " args = content.input\n",
+ " return args.get('query'), args.get('max_results')\n",
+ "\n",
+ " # If no tool use was found, return a default query and the provided max_results\n",
+ " return f\"{instruction}\", 5"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "XFAtW11NdPeF",
+ "outputId": "840da560-ffb8-4ea9-be73-a5f8dfb6106b"
+ },
+ "outputs": [],
+ "source": [
+ "# instruction = \"Answer the following question: What are the latest advancements in Agentic LLMs?\"\n",
+ "# arxiv_query, max_results = generate_arxiv_query_args(instruction)\n",
+ "# print(f\"ArXiv query: {arxiv_query}\")\n",
+ "# print(f\"Max results: {max_results}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "38kxv-ZSdPeF"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def fetch_arxiv_papers(query, max_results=5):\n",
+ " # Initialize the arxiv Client\n",
+ " arxiv_client = arxiv.Client()\n",
+ "\n",
+ " # Create the search object\n",
+ " search = arxiv.Search(\n",
+ " query=query,\n",
+ " max_results=max_results,\n",
+ " sort_by=arxiv.SortCriterion.Relevance,\n",
+ " sort_order=arxiv.SortOrder.Descending\n",
+ " )\n",
+ "\n",
+ " # Fetch the results using client.results() and convert them to ArxivPaper objects\n",
+ " papers = []\n",
+ " for result in arxiv_client.results(search):\n",
+ " paper = convert_raw_arxiv_to_pydantic(result)\n",
+ " papers.append(paper)\n",
+ "\n",
+ " return papers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YWaCLZaSdPeF",
+ "outputId": "6bf3ed22-cd4a-4855-b5cc-f13843233bf7"
+ },
+ "outputs": [],
+ "source": [
+ "# arxiv_papers = fetch_arxiv_papers(arxiv_query, max_results)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QziEuY-cdPeG"
+ },
+ "source": [
+ "## Create a sample Arxiv paper object and load its PDF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aD17bwJIdPeG"
+ },
+ "source": [
+ "In this section, we demonstrate how to create a sample `ArxivPaper` object and load its corresponding PDF. This process is crucial for our summarization pipeline, as it provides both the metadata and the actual content of the paper.\n",
+ "\n",
+ "### Creating the ArxivPaper object\n",
+ "\n",
+ "The `ArxivPaper` class is a custom data structure that encapsulates various attributes of an arXiv paper, including:\n",
+ "\n",
+ "- `entry_id`: A unique identifier for the paper\n",
+ "- `updated` and `published`: Timestamps for when the paper was last updated and initially published\n",
+ "- `title`: The title of the paper\n",
+ "- `authors`: A list of `Author` objects representing the paper's authors\n",
+ "- `summary`: An abstract or brief description of the paper's content\n",
+ "- `doi`: The Digital Object Identifier for the paper\n",
+ "- `categories`: The arXiv categories the paper belongs to\n",
+ "- `links`: Various URLs associated with the paper, including its abstract and PDF\n",
+ "- `pdf_url`: A direct link to the paper's PDF\n",
+ "\n",
+ "In the code snippet below, we create an `ArxivPaper` object for a paper titled \"CRAG -- Comprehensive RAG Benchmark\". This paper discusses a new benchmark for Retrieval-Augmented Generation (RAG) systems, which is highly relevant to our summarization task.\n",
+ "\n",
+ "### Loading the PDF\n",
+ "\n",
+ "After creating the `ArxivPaper` object, we use the `load_pdf` function to fetch and load the actual PDF content. This function:\n",
+ "\n",
+ "1. Retrieves the PDF URL from the `ArxivPaper` object\n",
+ "2. Downloads the PDF content using the `requests` library\n",
+ "3. Creates a `BytesIO` object from the downloaded content\n",
+ "4. Uses `PyPDF2.PdfReader` to create a PDF reader object\n",
+ "\n",
+ "The `load_pdf` function allows us to work with the actual content of the paper, which is essential for our summarization task.\n",
+ "\n",
+ "By using this sample object and loading its PDF, we can proceed with our chain of density summarization process and evaluate its performance on a known, controlled input. This approach helps in debugging, fine-tuning, and showcasing the capabilities of our summarization pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "u8bLDJpVdPeG"
+ },
+ "outputs": [],
+ "source": [
+ "arxiv_paper = ArxivPaper(\n",
+ " entry_id=\"http://arxiv.org/abs/2406.04744v1\",\n",
+ " updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),\n",
+ " published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),\n",
+ " title=\"CRAG -- Comprehensive RAG Benchmark\",\n",
+ " authors=[\n",
+ " Author(full_name=\"Xiao Yang\"),\n",
+ " Author(full_name=\"Kai Sun\"),\n",
+ " Author(full_name=\"Hao Xin\"),\n",
+ " Author(full_name=\"Yushi Sun\"),\n",
+ " Author(full_name=\"Nikita Bhalla\"),\n",
+ " Author(full_name=\"Xiangsen Chen\"),\n",
+ " Author(full_name=\"Sajal Choudhary\"),\n",
+ " Author(full_name=\"Rongze Daniel Gui\"),\n",
+ " Author(full_name=\"Ziran Will Jiang\"),\n",
+ " Author(full_name=\"Ziyu Jiang\"),\n",
+ " Author(full_name=\"Lingkun Kong\"),\n",
+ " Author(full_name=\"Brian Moran\"),\n",
+ " Author(full_name=\"Jiaqi Wang\"),\n",
+ " Author(full_name=\"Yifan Ethan Xu\"),\n",
+ " Author(full_name=\"An Yan\"),\n",
+ " Author(full_name=\"Chenyu Yang\"),\n",
+ " Author(full_name=\"Eting Yuan\"),\n",
+ " Author(full_name=\"Hanwen Zha\"),\n",
+ " Author(full_name=\"Nan Tang\"),\n",
+ " Author(full_name=\"Lei Chen\"),\n",
+ " Author(full_name=\"Nicolas Scheffer\"),\n",
+ " Author(full_name=\"Yue Liu\"),\n",
+ " Author(full_name=\"Nirav Shah\"),\n",
+ " Author(full_name=\"Rakesh Wanga\"),\n",
+ " Author(full_name=\"Anuj Kumar\"),\n",
+ " Author(full_name=\"Wen-tau Yih\"),\n",
+ " Author(full_name=\"Xin Luna Dong\")\n",
+ " ],\n",
+ " summary=\"Retrieval-Augmented Generation (RAG) has recently emerged as a promising solution to alleviate Large Language Model (LLM)'s deficiency in lack of knowledge. Existing RAG datasets, however, do not adequately represent the diverse and dynamic nature of real-world Question Answering (QA) tasks. To bridge this gap, we introduce the Comprehensive RAG Benchmark (CRAG), a factual question answering benchmark of 4,409 question-answer pairs and mock APIs to simulate web and Knowledge Graph (KG) search. CRAG is designed to encapsulate a diverse array of questions across five domains and eight question categories, reflecting varied entity popularity from popular to long-tail, and temporal dynamisms ranging from years to seconds. Our evaluation on this benchmark highlights the gap to fully trustworthy QA. Whereas most advanced LLMs achieve <=34% accuracy on CRAG, adding RAG in a straightforward manner improves the accuracy only to 44%. State-of-the-art industry RAG solutions only answer 63% questions without any hallucination. CRAG also reveals much lower accuracy in answering questions regarding facts with higher dynamism, lower popularity, or higher complexity, suggesting future research directions. The CRAG benchmark laid the groundwork for a KDD Cup 2024 challenge, attracting thousands of participants and submissions within the first 50 days of the competition. We commit to maintaining CRAG to serve research communities in advancing RAG solutions and general QA solutions.\",\n",
+ " comment=\"\",\n",
+ " journal_ref=None,\n",
+ " doi=\"10.48550/arXiv.2406.04744\",\n",
+ " primary_category=\"cs.CL\",\n",
+ " categories=[\"cs.CL\"],\n",
+ " links=[\n",
+ " Link(href=\"https://arxiv.org/abs/2406.04744\", title=\"Abstract\", rel=\"alternate\", content_type=None),\n",
+ " Link(href=\"https://arxiv.org/pdf/2406.04744\", title=\"pdf\", rel=\"related\", content_type=None)\n",
+ " ],\n",
+ " pdf_url=\"https://arxiv.org/pdf/2406.04744\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# LLama 3.1 Paper as an example too\n",
+ "# arxiv_paper = ArxivPaper(\n",
+ "# entry_id=\"http://ai.meta.com/research/publications/the-llama-3-herd-of-models\",\n",
+ "# updated=datetime(2024, 7, 23, 0, 0, 0, tzinfo=timezone.utc),\n",
+ "# published=datetime(2024, 7, 23, 0, 0, 0, tzinfo=timezone.utc),\n",
+ "# title=\"The Llama 3 Herd of Models\",\n",
+ "# authors=[\n",
+ "# Author(full_name=\"Llama team\")\n",
+ "# ],\n",
+ "# summary=\"This paper presents Llama 3, a new set of foundation models that natively support multilinguality, coding, reasoning, and tool usage. The largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. Llama 3 delivers comparable quality to leading language models such as GPT-4 on various tasks. The paper includes extensive empirical evaluation and discusses experiments integrating image, video, and speech capabilities via a compositional approach. The authors publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and the Llama Guard 3 model for input and output safety.\",\n",
+ "# comment=\"\",\n",
+ "# journal_ref=None,\n",
+ "# doi=None,\n",
+ "# primary_category=\"cs.CL\",\n",
+ "# categories=[\"cs.CL\", \"cs.AI\", \"cs.CV\", \"cs.SD\"],\n",
+ "# links=[\n",
+ "# Link(href=\"https://ai.meta.com/research/publications/the-llama-3-herd-of-models/\", title=\"Abstract\", rel=\"alternate\", content_type=None),\n",
+ "# Link(href=\"https://scontent-iad3-1.xx.fbcdn.net/v/t39.2365-6/452387774_1036916434819166_4173978747091533306_n.pdf?_nc_cat=104&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=t6egZJ8QdI4Q7kNvgFlW62W&_nc_ht=scontent-iad3-1.xx&oh=00_AYC5wOxvDl7FhvRYoGpOweAZW11hT0hOR3Mlsk0d5kIpTA&oe=66A83D0D\", title=\"pdf\", rel=\"related\", content_type=\"application/pdf\")\n",
+ "# ],\n",
+ "# pdf_url=\"https://scontent-iad3-1.xx.fbcdn.net/v/t39.2365-6/452387774_1036916434819166_4173978747091533306_n.pdf?_nc_cat=104&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=t6egZJ8QdI4Q7kNvgFlW62W&_nc_ht=scontent-iad3-1.xx&oh=00_AYC5wOxvDl7FhvRYoGpOweAZW11hT0hOR3Mlsk0d5kIpTA&oe=66A83D0D\"\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 36
+ },
+ "id": "oL0_q6XWdPeG",
+ "outputId": "842f8fc8-7f31-48de-83db-b9985b9893c7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'https://scontent-iad3-1.xx.fbcdn.net/v/t39.2365-6/452387774_1036916434819166_4173978747091533306_n.pdf?_nc_cat=104&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=t6egZJ8QdI4Q7kNvgFlW62W&_nc_ht=scontent-iad3-1.xx&oh=00_AYC5wOxvDl7FhvRYoGpOweAZW11hT0hOR3Mlsk0d5kIpTA&oe=66A83D0D'"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arxiv_paper.pdf_url"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "id": "WUbtqoyedPeG"
+ },
+ "outputs": [],
+ "source": [
+ "def load_pdf(arxiv_result):\n",
+ " pdf_url = arxiv_result[\"pdf_url\"]\n",
+ " response = requests.get(pdf_url)\n",
+ " pdf_file = io.BytesIO(response.content)\n",
+ " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
+ " return pdf_reader"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IMHuvs7XdPeG"
+ },
+ "source": [
+ "## Convert Images to Text using Sonnet's vision capabilities\n",
+ "\n",
+ "In this section, we leverage Claude 3 Sonnet's advanced vision capabilities to convert images from ArXiv PDFs into detailed textual descriptions. This process is crucial for creating a comprehensive text-based representation of the entire paper, including figures and diagrams.\n",
+ "\n",
+ "### Key Components:\n",
+ "\n",
+ "1. **Vector Graphic Conversion**:\n",
+ " - The `convert_vector_graphic_page_to_image` function handles vector graphics in PDFs, converting them to PNG images for further processing.\n",
+ " - This step is essential for capturing complex diagrams and charts that are often present in scientific papers.\n",
+ " - If direct image extraction is not possible (e.g., for SVGs or other vector graphics), the function converts the entire page to an image.\n",
+ " - In such cases, the LLM is instructed to focus solely on describing the images on the page, ignoring any text content.\n",
+ "\n",
+ "2. **Image Processing**:\n",
+ " - Two main functions, `process_figure_image` and `process_vector_image_pdf`, utilize Claude 3 Sonnet to analyze and describe images.\n",
+ " - `process_figure_image` focuses on individual figures, providing detailed technical descriptions.\n",
+ " - `process_vector_image_pdf` handles full PDF pages that may contain multiple vector graphics.\n",
+ "\n",
+ "3. **Image Extraction and Description**:\n",
+ " - The `extract_images` function iterates through PDF pages, extracting both raster images and vector graphics.\n",
+ " - It calls the appropriate processing function for each image type, generating textual descriptions.\n",
+ "\n",
+ "4. **Text Integration**:\n",
+ " - `replace_images_with_descriptions` combines the extracted text from the PDF with the generated image descriptions.\n",
+ " - This creates a unified text document that includes both the original text and detailed descriptions of all visual elements.\n",
+ "\n",
+ "By converting images to text, we ensure that the chain of density summarization process can incorporate information from all aspects of the paper, including visual data. This comprehensive approach allows for more accurate and informative summaries, especially for papers with significant visual content."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "YFZkmB6TdPeG"
+ },
+ "outputs": [],
+ "source": [
+ "def convert_vector_graphic_page_to_image(pdf_page, scale_factor=0.5):\n",
+ " def get_object(obj):\n",
+ " if isinstance(obj, PyPDF2.generic.IndirectObject):\n",
+ " return obj.get_object()\n",
+ " return obj\n",
+ "\n",
+ " resources = get_object(pdf_page.get('/Resources', {}))\n",
+ " xobject = get_object(resources.get('/XObject', {}))\n",
+ "\n",
+ " # Check if there's a figure that's not an image\n",
+ " if xobject:\n",
+ " for obj in xobject.values():\n",
+ " obj = get_object(obj)\n",
+ " if isinstance(obj, dict) and obj.get('/Subtype') == '/Form': # This indicates a vector graphic\n",
+ " # Convert the page to a PIL Image\n",
+ " pdf_bytes = io.BytesIO()\n",
+ " pdf_writer = PyPDF2.PdfWriter()\n",
+ " pdf_writer.add_page(pdf_page)\n",
+ " pdf_writer.write(pdf_bytes)\n",
+ " pdf_bytes.seek(0)\n",
+ "\n",
+ " # Convert PDF to image\n",
+ " images = convert_from_bytes(pdf_bytes.getvalue(), fmt='png')\n",
+ "\n",
+ " if images:\n",
+ " image = images[0]\n",
+ " # Resize the image\n",
+ " new_size = (int(image.width * scale_factor), int(image.height * scale_factor))\n",
+ " image = image.resize(new_size, Image.LANCZOS)\n",
+ " img_byte_arr = io.BytesIO()\n",
+ " image.save(img_byte_arr, format='PNG')\n",
+ " img_byte_arr = img_byte_arr.getvalue()\n",
+ " img_str = base64.b64encode(img_byte_arr).decode(\"utf-8\")\n",
+ " data_url = f\"data:image/png;base64,{img_str}\"\n",
+ " return data_url\n",
+ "\n",
+ " return None # Return None if no conversion was needed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "id": "F80m-Rv5dPeH"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def process_figure_image(data_url, model=\"claude-3-5-sonnet-20240620\"):\n",
+ " \"\"\"Process image data and return a detailed technical description.\"\"\"\n",
+ " img_str = data_url.split(\",\")[1]\n",
+ "\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"image\",\n",
+ " \"source\": {\n",
+ " \"type\": \"base64\",\n",
+ " \"media_type\": \"image/png\",\n",
+ " \"data\": img_str,\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"\"\"Analyze this image as if it's a figure from a scientific research paper. Provide a detailed technical description addressing the following:\n",
+ "\n",
+ "1. Type of figure (e.g., graph, diagram, flowchart, experimental setup)\n",
+ "2. Key components or variables represented\n",
+ "3. Relationships or trends depicted\n",
+ "4. Quantitative information (if present)\n",
+ "5. Methodology or process illustrated (if applicable)\n",
+ "6. Potential implications or conclusions that can be drawn\n",
+ "7. Any limitations or assumptions evident in the figure\n",
+ "\n",
+ "Focus on technical accuracy and relevance to scientific research. Avoid general descriptions and concentrate on the specific scientific content presented.\"\"\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " )\n",
+ " return response.content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "id": "-CCr7EEtdPeH"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def process_vector_image_pdf(data_url, model=\"claude-3-5-sonnet-20240620\"):\n",
+ " img_str = data_url.split(\",\")[1]\n",
+ "\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"image\",\n",
+ " \"source\": {\n",
+ " \"type\": \"base64\",\n",
+ " \"media_type\": \"image/png\",\n",
+ " \"data\": img_str,\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"\"\"This image is a full page from a scientific paper PDF, converted to PNG format. It may contain one or more vector graphic figures or charts. Your task is to:\n",
+ "\n",
+ "1. Identify and focus solely on the vector graphic figures or charts within the page.\n",
+ "2. For each identified figure or chart, provide a detailed technical analysis addressing:\n",
+ "\n",
+ " a. Type of figure (e.g., graph, diagram, flowchart)\n",
+ " b. Key components or variables represented\n",
+ " c. Relationships or trends depicted\n",
+ " d. Quantitative information (if present)\n",
+ " e. Methodology or process illustrated (if applicable)\n",
+ " f. Potential implications or conclusions that can be drawn\n",
+ "\n",
+ "3. Ignore any text or other elements on the page that are not part of the vector graphic figures.\n",
+ "4. If multiple figures are present, analyze each separately and clearly indicate which figure you are describing.\n",
+ "\n",
+ "Focus on providing accurate, technical descriptions of the vector graphic content only.\"\"\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " )\n",
+ " return response.content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "id": "XoyL0_HudPeH"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def extract_images(paper, model=\"claude-3-5-sonnet-20240620\"):\n",
+ " \"\"\"Extract text and images from PDF content.\"\"\"\n",
+ " pdf_reader = load_pdf(paper)\n",
+ " all_images = []\n",
+ "\n",
+ " aspect_ratio_sizes = {\n",
+ " (1, 1): (1092, 1092),\n",
+ " (3, 4): (951, 1268),\n",
+ " (2, 3): (896, 1344),\n",
+ " (9, 16): (819, 1456),\n",
+ " (1, 2): (784, 1568)\n",
+ " }\n",
+ "\n",
+ " def get_closest_aspect_ratio(width, height):\n",
+ " img_ratio = width / height\n",
+ " return min(aspect_ratio_sizes.keys(), key=lambda x: abs(x[0]/x[1] - img_ratio))\n",
+ "\n",
+ " for page in pdf_reader.pages:\n",
+ " images = []\n",
+ "\n",
+ " for image in page.images:\n",
+ " img_data = image.data\n",
+ " kind = filetype.guess(img_data)\n",
+ " if kind is None:\n",
+ " print(\"Cannot guess file type!\")\n",
+ " continue\n",
+ "\n",
+ " # Resize image if necessary\n",
+ " img = Image.open(io.BytesIO(img_data))\n",
+ " closest_ratio = get_closest_aspect_ratio(img.width, img.height)\n",
+ " new_size = aspect_ratio_sizes[closest_ratio]\n",
+ " \n",
+ " if img.width != new_size[0] or img.height != new_size[1]:\n",
+ " img = img.resize(new_size, Image.LANCZOS)\n",
+ " \n",
+ " # Convert resized image back to bytes\n",
+ " img_byte_arr = io.BytesIO()\n",
+ " img.save(img_byte_arr, format=img.format if img.format else 'PNG')\n",
+ " img_data = img_byte_arr.getvalue()\n",
+ "\n",
+ "\n",
+ " img_str = base64.b64encode(img_data).decode(\"utf-8\")\n",
+ " data_url = f\"data:{kind.mime};base64,{img_str}\"\n",
+ " try:\n",
+ " images.append(\n",
+ " {\"image\": data_url, \"description\": process_figure_image(data_url, model=model)}\n",
+ " )\n",
+ " except Exception as e:\n",
+ " print(f\"Error processing image: {e}\")\n",
+ " images.append({\"image\": data_url, \"description\": \"\"})\n",
+ "\n",
+ " vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)\n",
+ " if vector_graphics_image_data_url:\n",
+ " images.append({\"image\": vector_graphics_image_data_url, \"description\": process_vector_image_pdf(vector_graphics_image_data_url, model=model)})\n",
+ " all_images.append(images)\n",
+ "\n",
+ " return all_images"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "wVFvk4uWdPeH"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def replace_images_with_descriptions(paper, images):\n",
+ " pdf_reader = load_pdf(paper)\n",
+ " text = \"\"\n",
+ " for page_num, page in enumerate(pdf_reader.pages):\n",
+ " text += page.extract_text() + \"\\n\\n\"\n",
+ " if page_num >= len(images):\n",
+ " continue\n",
+ " if images[page_num] and len(images[page_num]) > 0:\n",
+ " text += f\"\\n\\n[Image Descriptions for page {page_num+1}]\\n\"\n",
+ " for image_num, image in enumerate(images[page_num]):\n",
+ " text += f\"\\n[Image {image_num+1}]: {image['description']}\\n\"\n",
+ " text += \"[END OF IMAGE DESCRIPTIONS]\\n\"\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ICr4VrMDdPeH"
+ },
+ "source": [
+ "## Chain of Density Summarization\n",
+ "\n",
+ "The Chain of Density (CoD) summarization technique is a powerful method for creating increasingly dense and informative summaries. In this section, we'll explore how to implement CoD for ArXiv PDF summarization, including specific preprocessing and postprocessing steps to evaluate the model's performance.\n",
+ "\n",
+ "Chain of Density is an iterative approach to summarization that progressively refines and condenses information. The process involves several key steps:\n",
+ "\n",
+ "1. **Initial Summarization**: Starting with the full document, the `summarize_current_summary` function creates an initial summary focused on a specific instruction.\n",
+ "\n",
+ "2. **Iterative Refinement**: The `iterative_density_summarization` function repeatedly calls `summarize_current_summary`, each time taking the previous summary as input. This process:\n",
+ " - Identifies new, important technical entities or ideas from the original text\n",
+ " - Incorporates these new elements into the summary\n",
+ " - Increases overall information density while maintaining focus on the instruction\n",
+ "\n",
+ "3. **Final Condensation**: After multiple iterations, the `final_summary` function creates an extremely dense summary, aiming to reduce length by 30-40% while retaining all critical technical content.\n",
+ "\n",
+ "The `chain_of_density_summarization` function orchestrates this entire process:\n",
+ "\n",
+ "```python\n",
+ "@weave.op()\n",
+ "def chain_of_density_summarization(document, instruction, current_summary=\"\", model=\"claude-3-5-sonnet-20240620\", density_iterations=2):\n",
+ " current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)\n",
+ " final_summary_text = final_summary(instruction, current_summary, model)\n",
+ " print(f\"Final Summary:\\n{final_summary_text}\\n\")\n",
+ "\n",
+ " return {\n",
+ " \"final_summary\": final_summary_text,\n",
+ " \"accumulated_summary\": current_summary,\n",
+ " \"iteration_summaries\": iteration_summaries,\n",
+ " }\n",
+ "```\n",
+ "\n",
+ "This function takes the preprocessed document, a specific instruction to focus on, the model to use, and the number of density iterations. It returns a dictionary containing:\n",
+ "\n",
+ "- The final, highly condensed summary\n",
+ "- The accumulated summary from all iterations\n",
+ "- Individual summaries from each iteration\n",
+ "\n",
+ "By using this approach, Chain of Density creates summaries that are progressively more concise, technically precise, and information-dense, while remaining focused on the specific instruction provided. This makes it particularly well-suited for summarizing complex technical documents like ArXiv papers, where maintaining accuracy and depth of information is crucial."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "id": "JvQS7-zfdPeH"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def summarize_current_summary(document, instruction, current_summary=\"\", iteration=1, model=\"claude-3-5-sonnet-20240620\"):\n",
+ " max_tokens = 4096 # Adjust this value based on the model's context window\n",
+ "\n",
+ " prompt = f\"\"\"\n",
+ " Document:\n",
+ " {document}\n",
+ "\n",
+ " Current summary:\n",
+ " {current_summary}\n",
+ "\n",
+ " Instruction to focus on: {instruction}\n",
+ "\n",
+ " Iteration: {iteration}\n",
+ "\n",
+ " Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction using the below approach:\n",
+ "\n",
+ " 1. Carefully read the current summary and the instruction.\n",
+ "\n",
+ " 2. Identify 1-3 new, important technical entities or ideas from the original text that:\n",
+ " - Are directly relevant to the instruction\n",
+ " - Are not yet present in the current summary\n",
+ " - Add significant, specific information to the summary\n",
+ " - Are preferably 5 words or fewer\n",
+ " - May include methodologies, algorithms, metrics, or key findings\n",
+ " - Ensure to include this in the output before the summary\n",
+ "\n",
+ " 3. Write a new summary that:\n",
+ " - Incorporates the newly identified entities/ideas\n",
+ " - Retains all crucial information from the current summary\n",
+ " - Increases overall information density\n",
+ " - Remains focused on addressing the instruction\n",
+ " - Utilizes the response window of {max_tokens} tokens\n",
+ "\n",
+ " Guidelines:\n",
+ " - Prioritize technical accuracy and specificity over general readability\n",
+ " - Use precise terminology, domain-specific jargon, and include quantitative details where relevant\n",
+ " - Ensure all information is directly related to the instruction\n",
+ " - Make every word count: rewrite to improve density and make space for new technical entities\n",
+ " - Employ fusion, compression, and removal of less informative phrases to increase density\n",
+ " - Never drop entities or technical details from the current summary that are relevant to the instruction\n",
+ " - Maintain coherence while maximizing information density\n",
+ "\n",
+ " Your goal is to create a summary that is noticeably denser, more technical, and more informative than the previous one, utilizing the response window of {max_tokens} tokens while staying laser-focused on the instruction. The summary should be suitable for an expert audience in the field.\"\"\"\n",
+ "\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=max_tokens,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ " return response.content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "id": "fV0lFX-jdPeI"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def iterative_density_summarization(document, instruction, current_summary, density_iterations, model):\n",
+ " iteration_summaries = []\n",
+ " for iteration in range(1, density_iterations + 1):\n",
+ " current_summary = summarize_current_summary(document, instruction, current_summary, iteration, model)\n",
+ " iteration_summaries.append(current_summary)\n",
+ " print(f\"Iteration {iteration}:\\n{current_summary}\\n\")\n",
+ " return current_summary, iteration_summaries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "id": "bgUKuo4RdPeI"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def final_summary(instruction, current_summary, model):\n",
+ " return anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": f\"\"\"Given this summary:\n",
+ "\n",
+ "{current_summary}\n",
+ "\n",
+ "And this instruction to focus on:\n",
+ "\n",
+ "{instruction}\n",
+ "\n",
+ "Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction. Follow these guidelines:\n",
+ "\n",
+ "1. Aim to reduce length by 30-40% while retaining all critical technical content relevant to the instruction.\n",
+ "2. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.\n",
+ "3. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.\n",
+ "4. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.\n",
+ "5. Ensure that all key entities and concepts from the original summary that relate to the instruction are represented.\n",
+ "6. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.\n",
+ "7. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints.\n",
+ "8. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.\n",
+ "\n",
+ "The final summary should be a highly concentrated, technical distillation of the research that specifically addresses the given instruction, suitable for specialists in the field.\"\"\",\n",
+ " }\n",
+ " ],\n",
+ " ).content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "id": "Q00DwNZYdPeI"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def chain_of_density_summarization(document, instruction, current_summary=\"\", model=\"claude-3-5-sonnet-20240620\", density_iterations=2):\n",
+ " current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)\n",
+ " final_summary_text = final_summary(instruction, current_summary, model)\n",
+ " print(f\"Final Summary:\\n{final_summary_text}\\n\")\n",
+ "\n",
+ " return {\n",
+ " \"final_summary\": final_summary_text,\n",
+ " \"accumulated_summary\": current_summary,\n",
+ " \"iteration_summaries\": iteration_summaries,\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-enc1pEodPeI"
+ },
+ "source": [
+ "## Create a Weave Model Object to better serialize the model for experimentation\n",
+ "\n",
+ "This section defines an `ArxivChainOfDensityPipeline` class that encapsulates our summarization pipeline as a `weave.Model`. Key features:\n",
+ "\n",
+ "- Configurable parameters: `model` and `density_iterations`\n",
+ "- `predict` method: Processes an `ArxivPaper` object and instruction through the entire pipeline\n",
+ "\n",
+ "The class structure enables easy serialization, parameter adjustment, and reproducibility of experiments. Usage example is provided for instantiation and prediction."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "id": "mHN4lPeOdPeI"
+ },
+ "outputs": [],
+ "source": [
+ "class ArxivChainOfDensityPipeline(weave.Model):\n",
+ "\n",
+ " model: str = \"claude-3-5-sonnet-20240620\"\n",
+ " density_iterations: int = 3\n",
+ "\n",
+ " def __init__(self, model: str = \"claude-3-5-sonnet-20240620\", density_iterations: int = 3):\n",
+ " super().__init__()\n",
+ " self.model = model\n",
+ " self.density_iterations = density_iterations\n",
+ "\n",
+ " @weave.op()\n",
+ " def predict(self, paper: ArxivPaper, instruction: str) -> dict:\n",
+ " extracted_images = extract_images(paper)\n",
+ " cleaned_text = replace_images_with_descriptions(paper, extracted_images)\n",
+ " result = chain_of_density_summarization(cleaned_text, instruction, model=self.model, density_iterations=self.density_iterations)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OXt84krFdPeJ",
+ "outputId": "cc4cef66-7053-47d7-d74a-4755eae1a29b"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Iteration 1:\n",
+ "New entities/ideas:\n",
+ "1. Knowledge classification\n",
+ "2. Scaling law experiments\n",
+ "3. Global batch size\n",
+ "\n",
+ "Summary:\n",
+ "\n",
+ "The approach to experimenting with different data mixtures involved a two-stage methodology utilizing knowledge classification and scaling law experiments. To determine the optimal data mix, a classifier was developed to categorize the types of information contained in web data, enabling more effective downsampling of over-represented categories such as arts and entertainment.\n",
+ "\n",
+ "Scaling law experiments were conducted to predict the performance of large models based on small-scale trials. Multiple small models were trained on various data mixes, and their performance was used to forecast the behavior of larger models. This process was iterated to select new data mix candidates. Subsequently, a larger model was trained on the candidate mix and evaluated on key benchmarks.\n",
+ "\n",
+ "The final data mix composition consisted of approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were carefully tuned, with global batch sizes ranging from 250K to 4M, depending on the compute scale. Peak learning rates were set between 2e-4 and 4e-4, adjusted based on model size. Weight decay was dynamically set to 0.1 times the learning rate at each step.\n",
+ "\n",
+ "This iterative approach to data mix optimization, combined with rigorous scaling law experiments and hyperparameter tuning, allowed for the development of a highly effective training dataset for the Llama 3 model family.\n",
+ "\n",
+ "Iteration 2:\n",
+ "New entities/ideas:\n",
+ "1. Data downsampling\n",
+ "2. Cosine learning rate schedule\n",
+ "3. Weight decay adjustment\n",
+ "\n",
+ "Summary:\n",
+ "\n",
+ "The approach to experimenting with different data mixtures involved a multifaceted methodology combining knowledge classification, scaling law experiments, and iterative optimization. A classifier was developed to categorize web data information types, enabling data downsampling of over-represented categories (e.g., arts and entertainment). This facilitated more effective determination of optimal data mix compositions.\n",
+ "\n",
+ "Scaling law experiments were conducted to predict large model performance based on small-scale trials. Multiple small models were trained on various data mixes, with their performance used to forecast larger model behavior. This process was iterated to select new data mix candidates. Larger models were subsequently trained on candidate mixes and evaluated on key benchmarks.\n",
+ "\n",
+ "The final data mix composition was optimized to approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were meticulously tuned, employing a cosine learning rate schedule with initial rates between 2e-4 and 4e-4, adjusted based on model size. Weight decay adjustment was implemented dynamically, set to 0.1 times the learning rate at each step.\n",
+ "\n",
+ "Global batch sizes were scaled from 250K to 4M, contingent on compute scale. This adaptive approach to batch sizing, combined with the cosine learning rate schedule and weight decay adjustment, allowed for efficient training across different model scales and computational resources.\n",
+ "\n",
+ "The iterative process of data mix optimization, underpinned by rigorous scaling law experiments and hyperparameter tuning, enabled the development of a highly effective training dataset for the Llama 3 model family. This methodology allowed for continuous refinement of the data mixture, ensuring optimal performance across various model sizes and computational constraints.\n",
+ "\n",
+ "Iteration 3:\n",
+ "New entities/ideas:\n",
+ "1. Knowledge classification\n",
+ "2. Over-represented category downsampling\n",
+ "3. On-policy data collection\n",
+ "\n",
+ "Summary:\n",
+ "\n",
+ "The approach to experimenting with different data mixtures involved a multifaceted methodology combining knowledge classification, scaling law experiments, and iterative optimization. A classifier was developed to categorize web data information types, enabling over-represented category downsampling (e.g., arts and entertainment). This facilitated more effective determination of optimal data mix compositions.\n",
+ "\n",
+ "Scaling law experiments were conducted to predict large model performance based on small-scale trials. Multiple small models were trained on various data mixes, with their performance used to forecast larger model behavior. This process was iterated to select new data mix candidates. Larger models were subsequently trained on candidate mixes and evaluated on key benchmarks, employing on-policy data collection for preference datasets in later rounds.\n",
+ "\n",
+ "The final data mix composition was optimized to approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were meticulously tuned, employing a cosine learning rate schedule with initial rates between 2e-4 and 4e-4, adjusted based on model size. Weight decay adjustment was implemented dynamically, set to 0.1 times the learning rate at each step.\n",
+ "\n",
+ "Global batch sizes were scaled from 250K to 4M, contingent on compute scale. This adaptive approach to batch sizing, combined with the cosine learning rate schedule and weight decay adjustment, allowed for efficient training across different model scales and computational resources.\n",
+ "\n",
+ "The iterative process of data mix optimization, underpinned by rigorous scaling law experiments and hyperparameter tuning, enabled the development of a highly effective training dataset for the Llama 3 model family. This methodology allowed for continuous refinement of the data mixture, ensuring optimal performance across various model sizes and computational constraints.\n",
+ "\n",
+ "Final Summary:\n",
+ "The approach to experimenting with different data mixtures involved:\n",
+ "\n",
+ "1. Knowledge classification: Developed classifier to categorize web data information types.\n",
+ "2. Over-represented category downsampling: Reduced overrepresented categories (e.g., arts/entertainment) for optimal mix composition.\n",
+ "3. Scaling law experiments: Trained small models on various mixes to predict large model performance.\n",
+ "4. Iterative optimization: Used small-scale predictions to select new mix candidates for larger models.\n",
+ "5. On-policy data collection: Employed for preference datasets in later rounds.\n",
+ "6. Benchmark evaluation: Assessed larger models trained on candidate mixes.\n",
+ "\n",
+ "Final optimized mix: ~50% general knowledge, 25% math/reasoning, 17% code, 8% multilingual tokens.\n",
+ "\n",
+ "Hyperparameter tuning: Cosine learning rate schedule (initial 2e-4 to 4e-4), dynamic weight decay (0.1 * learning rate), global batch sizes 250K-4M scaled with compute.\n",
+ "\n",
+ "This iterative process enabled continuous refinement across model sizes and computational constraints.\n",
+ "\n",
+ "🍩 https://wandb.ai/a-sh0ts/arxiv-chain-of-density-summarization-llama-questions/r/call/d36cb676-1f82-4418-8416-1fc61f0b779c\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'final_summary': 'The approach to experimenting with different data mixtures involved:\\n\\n1. Knowledge classification: Developed classifier to categorize web data information types.\\n2. Over-represented category downsampling: Reduced overrepresented categories (e.g., arts/entertainment) for optimal mix composition.\\n3. Scaling law experiments: Trained small models on various mixes to predict large model performance.\\n4. Iterative optimization: Used small-scale predictions to select new mix candidates for larger models.\\n5. On-policy data collection: Employed for preference datasets in later rounds.\\n6. Benchmark evaluation: Assessed larger models trained on candidate mixes.\\n\\nFinal optimized mix: ~50% general knowledge, 25% math/reasoning, 17% code, 8% multilingual tokens.\\n\\nHyperparameter tuning: Cosine learning rate schedule (initial 2e-4 to 4e-4), dynamic weight decay (0.1 * learning rate), global batch sizes 250K-4M scaled with compute.\\n\\nThis iterative process enabled continuous refinement across model sizes and computational constraints.',\n",
+ " 'accumulated_summary': 'New entities/ideas:\\n1. Knowledge classification\\n2. Over-represented category downsampling\\n3. On-policy data collection\\n\\nSummary:\\n\\nThe approach to experimenting with different data mixtures involved a multifaceted methodology combining knowledge classification, scaling law experiments, and iterative optimization. A classifier was developed to categorize web data information types, enabling over-represented category downsampling (e.g., arts and entertainment). This facilitated more effective determination of optimal data mix compositions.\\n\\nScaling law experiments were conducted to predict large model performance based on small-scale trials. Multiple small models were trained on various data mixes, with their performance used to forecast larger model behavior. This process was iterated to select new data mix candidates. Larger models were subsequently trained on candidate mixes and evaluated on key benchmarks, employing on-policy data collection for preference datasets in later rounds.\\n\\nThe final data mix composition was optimized to approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were meticulously tuned, employing a cosine learning rate schedule with initial rates between 2e-4 and 4e-4, adjusted based on model size. Weight decay adjustment was implemented dynamically, set to 0.1 times the learning rate at each step.\\n\\nGlobal batch sizes were scaled from 250K to 4M, contingent on compute scale. This adaptive approach to batch sizing, combined with the cosine learning rate schedule and weight decay adjustment, allowed for efficient training across different model scales and computational resources.\\n\\nThe iterative process of data mix optimization, underpinned by rigorous scaling law experiments and hyperparameter tuning, enabled the development of a highly effective training dataset for the Llama 3 model family. This methodology allowed for continuous refinement of the data mixture, ensuring optimal performance across various model sizes and computational constraints.',\n",
+ " 'iteration_summaries': ['New entities/ideas:\\n1. Knowledge classification\\n2. Scaling law experiments\\n3. Global batch size\\n\\nSummary:\\n\\nThe approach to experimenting with different data mixtures involved a two-stage methodology utilizing knowledge classification and scaling law experiments. To determine the optimal data mix, a classifier was developed to categorize the types of information contained in web data, enabling more effective downsampling of over-represented categories such as arts and entertainment.\\n\\nScaling law experiments were conducted to predict the performance of large models based on small-scale trials. Multiple small models were trained on various data mixes, and their performance was used to forecast the behavior of larger models. This process was iterated to select new data mix candidates. Subsequently, a larger model was trained on the candidate mix and evaluated on key benchmarks.\\n\\nThe final data mix composition consisted of approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were carefully tuned, with global batch sizes ranging from 250K to 4M, depending on the compute scale. Peak learning rates were set between 2e-4 and 4e-4, adjusted based on model size. Weight decay was dynamically set to 0.1 times the learning rate at each step.\\n\\nThis iterative approach to data mix optimization, combined with rigorous scaling law experiments and hyperparameter tuning, allowed for the development of a highly effective training dataset for the Llama 3 model family.',\n",
+ " 'New entities/ideas:\\n1. Data downsampling\\n2. Cosine learning rate schedule\\n3. Weight decay adjustment\\n\\nSummary:\\n\\nThe approach to experimenting with different data mixtures involved a multifaceted methodology combining knowledge classification, scaling law experiments, and iterative optimization. A classifier was developed to categorize web data information types, enabling data downsampling of over-represented categories (e.g., arts and entertainment). This facilitated more effective determination of optimal data mix compositions.\\n\\nScaling law experiments were conducted to predict large model performance based on small-scale trials. Multiple small models were trained on various data mixes, with their performance used to forecast larger model behavior. This process was iterated to select new data mix candidates. Larger models were subsequently trained on candidate mixes and evaluated on key benchmarks.\\n\\nThe final data mix composition was optimized to approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were meticulously tuned, employing a cosine learning rate schedule with initial rates between 2e-4 and 4e-4, adjusted based on model size. Weight decay adjustment was implemented dynamically, set to 0.1 times the learning rate at each step.\\n\\nGlobal batch sizes were scaled from 250K to 4M, contingent on compute scale. This adaptive approach to batch sizing, combined with the cosine learning rate schedule and weight decay adjustment, allowed for efficient training across different model scales and computational resources.\\n\\nThe iterative process of data mix optimization, underpinned by rigorous scaling law experiments and hyperparameter tuning, enabled the development of a highly effective training dataset for the Llama 3 model family. This methodology allowed for continuous refinement of the data mixture, ensuring optimal performance across various model sizes and computational constraints.',\n",
+ " 'New entities/ideas:\\n1. Knowledge classification\\n2. Over-represented category downsampling\\n3. On-policy data collection\\n\\nSummary:\\n\\nThe approach to experimenting with different data mixtures involved a multifaceted methodology combining knowledge classification, scaling law experiments, and iterative optimization. A classifier was developed to categorize web data information types, enabling over-represented category downsampling (e.g., arts and entertainment). This facilitated more effective determination of optimal data mix compositions.\\n\\nScaling law experiments were conducted to predict large model performance based on small-scale trials. Multiple small models were trained on various data mixes, with their performance used to forecast larger model behavior. This process was iterated to select new data mix candidates. Larger models were subsequently trained on candidate mixes and evaluated on key benchmarks, employing on-policy data collection for preference datasets in later rounds.\\n\\nThe final data mix composition was optimized to approximately 50% general knowledge tokens, 25% mathematical and reasoning tokens, 17% code tokens, and 8% multilingual tokens. Training hyperparameters were meticulously tuned, employing a cosine learning rate schedule with initial rates between 2e-4 and 4e-4, adjusted based on model size. Weight decay adjustment was implemented dynamically, set to 0.1 times the learning rate at each step.\\n\\nGlobal batch sizes were scaled from 250K to 4M, contingent on compute scale. This adaptive approach to batch sizing, combined with the cosine learning rate schedule and weight decay adjustment, allowed for efficient training across different model scales and computational resources.\\n\\nThe iterative process of data mix optimization, underpinned by rigorous scaling law experiments and hyperparameter tuning, enabled the development of a highly effective training dataset for the Llama 3 model family. This methodology allowed for continuous refinement of the data mixture, ensuring optimal performance across various model sizes and computational constraints.']}"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline()\n",
+ "arxiv_chain_of_density_pipeline.predict(arxiv_paper, \"\"\"\n",
+ "What was the approach to experimenting with different data mixtures\n",
+ "\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "y0p9XbaRdPeJ"
+ },
+ "source": [
+ "## Create our Evaluation Dataset\n",
+ "\n",
+ "In this section, we prepare a dataset for evaluating our Chain of Density (CoD) summarization pipeline on ArXiv papers. This dataset will allow us to assess the performance of our model across different papers and instructions.\n",
+ "\n",
+ "### Key Components:\n",
+ "\n",
+ "1. **Sample ArXiv Papers**: We create `ArxivPaper` objects for three different papers:\n",
+ " - `arxiv_paper1`: \"Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?\"\n",
+ " - `arxiv_paper2`: \"Many-Shot In-Context Learning\"\n",
+ " - `arxiv_paper3`: \"LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks\"\n",
+ "\n",
+ " Each `ArxivPaper` object contains metadata such as title, authors, summary, and PDF URL.\n",
+ "\n",
+ "2. **Evaluation Instructions**: We define a list of instructions that will guide the summarization process:\n",
+ " ```python\n",
+ " eval_instructions = [\n",
+ " \"Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.\",\n",
+ " \"Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins.\",\n",
+ " \"Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations.\"\n",
+ " ]\n",
+ " ```\n",
+ "\n",
+ "3. **Creating Evaluation Data**: We use `itertools.product()` to create combinations of papers and instructions:\n",
+ " ```python\n",
+ " eval_data = list(product(eval_papers, eval_instructions))\n",
+ " ```\n",
+ "\n",
+ "4. **Weave Dataset**: Finally, we create a Weave Dataset object that combines the paper, instruction, and original summary for each evaluation item:\n",
+ " ```python\n",
+ " dataset = weave.Dataset(name=\"we-paper-reading-eval-data\",\n",
+ " rows=[{\"paper\": arxiv_paper,\n",
+ " \"instruction\": instruction,\n",
+ " \"summary\": arxiv_paper.summary}\n",
+ " for arxiv_paper, instruction in eval_data])\n",
+ " ```\n",
+ "\n",
+ "5. **Publishing the Dataset**: We publish the dataset to make it available for evaluation:\n",
+ " ```python\n",
+ " weave.publish(dataset)\n",
+ " ```\n",
+ "\n",
+ "This evaluation dataset provides a structured way to assess our CoD summarization pipeline across different papers and instructions, allowing for comprehensive testing of the model's performance and adaptability."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "id": "hXvuctKJdPeJ"
+ },
+ "outputs": [],
+ "source": [
+ "arxiv_paper1 = ArxivPaper(\n",
+ " entry_id=\"http://arxiv.org/abs/2405.05904\",\n",
+ " updated=datetime(2024, 5, 13, 7, 29, 58, tzinfo=timezone.utc),\n",
+ " published=datetime(2024, 5, 9, 17, 0, 22, tzinfo=timezone.utc),\n",
+ " title=\"Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?\",\n",
+ " authors=[\n",
+ " Author(full_name=\"Zorik Gekhman\"),\n",
+ " Author(full_name=\"Gal Yona\"),\n",
+ " Author(full_name=\"Roee Aharoni\"),\n",
+ " Author(full_name=\"Matan Eyal\"),\n",
+ " Author(full_name=\"Amir Feder\"),\n",
+ " Author(full_name=\"Roi Reichart\"),\n",
+ " Author(full_name=\"Jonathan Herzig\")\n",
+ " ],\n",
+ " summary=(\"When large language models are aligned via supervised fine-tuning, they may encounter new factual information \"\n",
+ " \"that was not acquired through pre-training. It is often conjectured that this can teach the model the behavior \"\n",
+ " \"of hallucinating factually incorrect responses, as the model is trained to generate facts that are not grounded \"\n",
+ " \"in its pre-existing knowledge. In this work, we study the impact of such exposure to new knowledge on the capability \"\n",
+ " \"of the fine-tuned model to utilize its pre-existing knowledge. To this end, we design a controlled setup, focused on \"\n",
+ " \"closed-book QA, where we vary the proportion of the fine-tuning examples that introduce new knowledge. We demonstrate \"\n",
+ " \"that large language models struggle to acquire new factual knowledge through fine-tuning, as fine-tuning examples that \"\n",
+ " \"introduce new knowledge are learned significantly slower than those consistent with the model's knowledge. However, we \"\n",
+ " \"also find that as the examples with new knowledge are eventually learned, they linearly increase the model's tendency \"\n",
+ " \"to hallucinate. Taken together, our results highlight the risk in introducing new factual knowledge through fine-tuning, \"\n",
+ " \"and support the view that large language models mostly acquire factual knowledge through pre-training, whereas fine-tuning \"\n",
+ " \"teaches them to use it more efficiently.\"),\n",
+ " comment=None,\n",
+ " journal_ref=None,\n",
+ " doi=\"10.48550/arXiv.2405.05904\",\n",
+ " primary_category=\"cs.CL\",\n",
+ " categories=[\"cs.CL\"],\n",
+ " links=[\n",
+ " Link(href=\"https://arxiv.org/abs/2405.05904\", title=\"Abstract\", rel=\"alternate\"),\n",
+ " Link(href=\"https://arxiv.org/pdf/2405.05904\", title=\"pdf\", rel=\"related\")\n",
+ " ],\n",
+ " pdf_url=\"https://arxiv.org/pdf/2405.05904\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "id": "KcvBpUjjdPeJ"
+ },
+ "outputs": [],
+ "source": [
+ "arxiv_paper2 = ArxivPaper(\n",
+ " entry_id=\"http://arxiv.org/abs/2404.11018\",\n",
+ " updated=datetime(2024, 5, 22, 17, 6, 10, tzinfo=timezone.utc),\n",
+ " published=datetime(2024, 4, 17, 2, 49, 26, tzinfo=timezone.utc),\n",
+ " title=\"Many-Shot In-Context Learning\",\n",
+ " authors=[\n",
+ " Author(full_name=\"Rishabh Agarwal\"),\n",
+ " Author(full_name=\"Avi Singh\"),\n",
+ " Author(full_name=\"Lei M. Zhang\"),\n",
+ " Author(full_name=\"Bernd Bohnet\"),\n",
+ " Author(full_name=\"Luis Rosias\"),\n",
+ " Author(full_name=\"Stephanie Chan\"),\n",
+ " Author(full_name=\"Biao Zhang\"),\n",
+ " Author(full_name=\"Ankesh Anand\"),\n",
+ " Author(full_name=\"Zaheer Abbas\"),\n",
+ " Author(full_name=\"Azade Nova\"),\n",
+ " Author(full_name=\"John D. Co-Reyes\"),\n",
+ " Author(full_name=\"Eric Chu\"),\n",
+ " Author(full_name=\"Feryal Behbahani\"),\n",
+ " Author(full_name=\"Aleksandra Faust\"),\n",
+ " Author(full_name=\"Hugo Larochelle\")\n",
+ " ],\n",
+ " summary=(\"Large language models (LLMs) excel at few-shot in-context learning (ICL) -- learning from a few examples provided in context at inference, \"\n",
+ " \"without any weight updates. Newly expanded context windows allow us to investigate ICL with hundreds or thousands of examples -- the many-shot regime. \"\n",
+ " \"Going from few-shot to many-shot, we observe significant performance gains across a wide variety of generative and discriminative tasks. While promising, \"\n",
+ " \"many-shot ICL can be bottlenecked by the available amount of human-generated examples. To mitigate this limitation, we explore two new settings: Reinforced \"\n",
+ " \"and Unsupervised ICL. Reinforced ICL uses model-generated chain-of-thought rationales in place of human examples. Unsupervised ICL removes rationales from the \"\n",
+ " \"prompt altogether, and prompts the model only with domain-specific questions. We find that both Reinforced and Unsupervised ICL can be quite effective in the \"\n",
+ " \"many-shot regime, particularly on complex reasoning tasks. Finally, we demonstrate that, unlike few-shot learning, many-shot learning is effective at overriding \"\n",
+ " \"pretraining biases, can learn high-dimensional functions with numerical inputs, and performs comparably to fine-tuning. Our analysis also reveals the limitations \"\n",
+ " \"of next-token prediction loss as an indicator of downstream ICL performance.\"),\n",
+ " comment=None,\n",
+ " journal_ref=None,\n",
+ " doi=\"10.48550/arXiv.2404.11018\",\n",
+ " primary_category=\"cs.LG\",\n",
+ " categories=[\"cs.LG\", \"cs.AI\", \"cs.CL\"],\n",
+ " links=[\n",
+ " Link(href=\"https://arxiv.org/abs/2404.11018\", title=\"Abstract\", rel=\"alternate\"),\n",
+ " Link(href=\"https://arxiv.org/pdf/2404.11018\", title=\"pdf\", rel=\"related\")\n",
+ " ],\n",
+ " pdf_url=\"https://arxiv.org/pdf/2404.11018\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "id": "3PKYkvhTdPeK"
+ },
+ "outputs": [],
+ "source": [
+ "arxiv_paper3 = ArxivPaper(\n",
+ " entry_id=\"http://arxiv.org/abs/2406.18403\",\n",
+ " updated=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),\n",
+ " published=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),\n",
+ " title=\"LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks\",\n",
+ " authors=[\n",
+ " Author(full_name=\"Anna Bavaresco\"),\n",
+ " Author(full_name=\"Raffaella Bernardi\"),\n",
+ " Author(full_name=\"Leonardo Bertolazzi\"),\n",
+ " Author(full_name=\"Desmond Elliott\"),\n",
+ " Author(full_name=\"Raquel Fernández\"),\n",
+ " Author(full_name=\"Albert Gatt\"),\n",
+ " Author(full_name=\"Esam Ghaleb\"),\n",
+ " Author(full_name=\"Mario Giulianelli\"),\n",
+ " Author(full_name=\"Michael Hanna\"),\n",
+ " Author(full_name=\"Alexander Koller\"),\n",
+ " Author(full_name=\"André F. T. Martins\"),\n",
+ " Author(full_name=\"Philipp Mondorf\"),\n",
+ " Author(full_name=\"Vera Neplenbroek\"),\n",
+ " Author(full_name=\"Sandro Pezzelle\"),\n",
+ " Author(full_name=\"Barbara Plank\"),\n",
+ " Author(full_name=\"David Schlangen\"),\n",
+ " Author(full_name=\"Alessandro Suglia\"),\n",
+ " Author(full_name=\"Aditya K Surikuchi\"),\n",
+ " Author(full_name=\"Ece Takmaz\"),\n",
+ " Author(full_name=\"Alberto Testoni\")\n",
+ " ],\n",
+ " summary=(\"There is an increasing trend towards evaluating NLP models with LLM-generated judgments instead of human judgments. \"\n",
+ " \"In the absence of a comparison against human data, this raises concerns about the validity of these evaluations; in case they are conducted with proprietary models, \"\n",
+ " \"this also raises concerns over reproducibility. We provide JUDGE-BENCH, a collection of 20 NLP datasets with human annotations, and comprehensively evaluate 11 current LLMs, \"\n",
+ " \"covering both open-weight and proprietary models, for their ability to replicate the annotations. Our evaluations show that each LLM exhibits a large variance across datasets in its correlation to human judgments. \"\n",
+ " \"We conclude that LLMs are not yet ready to systematically replace human judges in NLP.\"),\n",
+ " comment=None,\n",
+ " journal_ref=None,\n",
+ " doi=\"10.48550/arXiv.2406.18403\",\n",
+ " primary_category=\"cs.CL\",\n",
+ " categories=[\"cs.CL\"],\n",
+ " links=[\n",
+ " Link(href=\"https://arxiv.org/abs/2406.18403\", title=\"Abstract\", rel=\"alternate\"),\n",
+ " Link(href=\"https://arxiv.org/pdf/2406.18403\", title=\"pdf\", rel=\"related\")\n",
+ " ],\n",
+ " pdf_url=\"https://arxiv.org/pdf/2406.18403\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 36
+ },
+ "id": "Ru9leaQPdPeK",
+ "outputId": "6c8f8da4-2fc5-44ad-a006-d9258c6b0831"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'https://arxiv.org/pdf/2406.18403'"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arxiv_paper3.pdf_url"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "id": "CvURkKmIdPeK"
+ },
+ "outputs": [],
+ "source": [
+ "eval_papers = [\n",
+ " arxiv_paper1,\n",
+ " arxiv_paper2,\n",
+ " arxiv_paper3\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "id": "CLjr3xshdPeK"
+ },
+ "outputs": [],
+ "source": [
+ "eval_instructions = [\n",
+ " \"Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.\",\n",
+ " \"Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins.\",\n",
+ " \"Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations.\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "id": "lxr_icGedPeL"
+ },
+ "outputs": [],
+ "source": [
+ "eval_data = list(product(eval_papers, eval_instructions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "id": "vQD6m4NIdPeL"
+ },
+ "outputs": [],
+ "source": [
+ "dataset = weave.Dataset(name=\"we-paper-reading-eval-data\", rows=[{\"paper\": arxiv_paper, \"instruction\": instruction, \"summary\": arxiv_paper.summary} for arxiv_paper, instruction in eval_data])\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "l7k2M3yddPeL",
+ "outputId": "2c0aa58e-6ac3-4086-a39a-5602efee7a75"
+ },
+ "outputs": [],
+ "source": [
+ "weave.publish(dataset)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sIZqvYlYdPeL"
+ },
+ "source": [
+ "## Define our metrics\n",
+ "\n",
+ "In this section, we establish a set of metrics to evaluate the quality and effectiveness of our Chain of Density (CoD) summarization pipeline for ArXiv PDFs. These metrics are designed to provide a comprehensive assessment of the summarization process, focusing on relevance, technical quality, and conciseness.\n",
+ "\n",
+ "### Key Metrics:\n",
+ "\n",
+ "1. **Summary Scoring (`score_summary`)**:\n",
+ " - Evaluates individual summaries based on three criteria:\n",
+ " - Relevance (0-5): How well the summary addresses the given instruction\n",
+ " - Technical Quality (0-5): Accuracy and depth of technical content\n",
+ " - Conciseness (0-5): Information density and brevity\n",
+ " - Uses GPT-4 to perform the evaluation, ensuring a nuanced assessment\n",
+ "\n",
+ "2. **Long-tail Statistics (`calculate_long_tail_stats`)**:\n",
+ " - Analyzes the distribution of scores across multiple summaries\n",
+ " - Calculates mean scores and tail ratios for each aspect (relevance, technical quality, conciseness)\n",
+ " - Helps identify overall performance and potential outliers\n",
+ "\n",
+ "3. **Iteration Impact Analysis (`analyze_iteration_impact`)**:\n",
+ " - Assesses the improvement of summaries across iterations\n",
+ " - Identifies the point of diminishing returns and cumulative improvement\n",
+ " - Useful for optimizing the number of iterations in the CoD process\n",
+ "\n",
+ "4. **Optimal Improvement Range (`find_optimal_improvement_range`)**:\n",
+ " - Determines the most effective range of iterations for improvement\n",
+ " - Considers moving averages of improvements to find sustained progress\n",
+ "\n",
+ "5. **Optimal Score Range (`find_optimal_score_range`)**:\n",
+ " - Identifies the iteration range that produces the highest quality summaries\n",
+ " - Helps in fine-tuning the CoD process for maximum effectiveness\n",
+ "\n",
+ "6. **Iteration Summary Processing (`process_iteration_summaries`)**:\n",
+ " - Aggregates and analyzes scores across all iterations\n",
+ " - Provides a holistic view of the summarization process's progression\n",
+ "\n",
+ "7. **Quality Scorer (`quality_scorer`)**:\n",
+ " - Combines all the above metrics into a comprehensive evaluation\n",
+ " - Analyzes iteration summaries, accumulated summary, and final summary\n",
+ " - Produces a flattened, easy-to-analyze score dictionary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "id": "yCplXunZdPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def score_summary(summary, summary_type, instruction, model):\n",
+ " openai_client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
+ " prompt = f\"\"\"Evaluate the quality of the following {summary_type} based on how well it addresses the given instruction. Use the scoring rules below to calculate three numerical scores between 0 and 10.\n",
+ "\n",
+ "Instruction: {instruction}\n",
+ "\n",
+ "{summary_type}:\n",
+ "{summary}\n",
+ "Scoring Rules:\n",
+ "1. Relevance (0-5):\n",
+ " - 5: Perfectly addresses all aspects of the instruction, focusing on key methodologies and novel contributions\n",
+ " Example: \"The paper introduces JUDGE-BENCH, a comprehensive evaluation framework comprising 20 NLP datasets with human annotations, designed to assess LLMs' capacity to replicate human judgments across diverse NLP tasks. The study employs a rigorous comparative analysis of 11 state-of-the-art LLMs, including both open-weight and proprietary models, utilizing correlation metrics to quantify alignment with human annotations.\"\n",
+ " - 4: Addresses most aspects of the instruction with minor omissions\n",
+ " Example: \"The research presents JUDGE-BENCH, a novel evaluation framework for LLMs consisting of 20 NLP datasets. It conducts a thorough assessment of 11 LLMs, analyzing their ability to replicate human judgments. The methodology involves correlation analysis between LLM outputs and human annotations.\"\n",
+ " - 3: Addresses the main points of the instruction but misses some details about methodologies or contributions\n",
+ " Example: \"The study proposes JUDGE-BENCH, a new benchmark for evaluating LLMs against human judgments in NLP tasks. It assesses multiple LLMs and finds significant variability in their performance across different datasets.\"\n",
+ " - 2: Partially addresses the instruction, missing significant aspects of methodologies or contributions\n",
+ " Example: \"The paper discusses a new method for evaluating language models using human-annotated datasets. It compares several LLMs and concludes that they are not yet ready to replace human judges in NLP tasks.\"\n",
+ " - 1: Barely addresses the instruction, focusing on tangential information\n",
+ " Example: \"The research explores various natural language processing tasks and the performance of language models. It suggests that human evaluation is still important in NLP.\"\n",
+ " - 0: Completely irrelevant to the instruction\n",
+ " Example: \"The paper discusses advancements in computer vision algorithms for image recognition using convolutional neural networks.\"\n",
+ "\n",
+ "2. Technical Quality (0-5):\n",
+ " - 5: Exceptionally accurate, detailed, and technically sound, with precise descriptions of methodologies and contributions\n",
+ " Example: \"JUDGE-BENCH employs a multi-faceted evaluation protocol, utilizing Pearson correlation coefficients (r) to quantify LLM-human judgment alignment across 20 diverse NLP tasks. The framework incorporates both discriminative and generative tasks, with a particular focus on nuanced linguistic phenomena such as pragmatic inference and discourse coherence. The study reports a mean correlation of r = 0.47 (σ = 0.18) across all models and tasks, with significant inter-task variability (range: 0.12 ≤ r ≤ 0.83). Notably, the best-performing LLM (GPT-4) achieved a maximum mean correlation of r = 0.62, still substantially below perfect alignment (r = 1.0), underscoring the persistent gap between LLM and human judgment capabilities.\"\n",
+ " - 4: Highly accurate with comprehensive technical details about research methods and findings\n",
+ " Example: \"The JUDGE-BENCH framework evaluates 11 LLMs across 20 NLP datasets using Pearson correlation to measure alignment with human judgments. The study reports a mean correlation of 0.47 across all models and tasks, with significant variability (σ = 0.18). The best-performing model (GPT-4) achieved a maximum mean correlation of 0.62, indicating a substantial gap between LLM and human judgment capabilities.\"\n",
+ " - 3: Generally accurate with good technical depth, but may lack some specifics\n",
+ " Example: \"JUDGE-BENCH evaluates LLMs using correlation analysis with human judgments across multiple NLP tasks. The study finds variable performance across models and tasks, with the best model achieving a mean correlation of 0.62. This suggests LLMs are not yet capable of consistently replicating human judgments in NLP tasks.\"\n",
+ " - 2: Mostly accurate but lacks important technical details about methodologies or contributions\n",
+ " Example: \"The study uses a new benchmark called JUDGE-BENCH to evaluate language models. It compares LLM outputs to human judgments using correlation analysis and finds that even the best models don't consistently match human performance across different NLP tasks.\"\n",
+ " - 1: Contains technical inaccuracies or lacks significant depth in describing research approaches\n",
+ " Example: \"The paper discusses a method for evaluating AI language models using human-annotated datasets. It shows that AI models don't always agree with human judgments, suggesting they need improvement.\"\n",
+ " - 0: Technically unsound or extremely superficial in describing methodologies and contributions\n",
+ " Example: \"The research uses AI to compare computer-generated text to human writing. It finds that AI is not as good as humans at understanding language.\"\n",
+ "\n",
+ "3. Conciseness (0-5):\n",
+ " - 5: Maximally information-dense without any unnecessary content, perfectly balancing detail and brevity\n",
+ " Example: \"JUDGE-BENCH: 20-dataset NLP evaluation framework. 11 LLMs assessed. Mean correlation with human judgments: r = 0.47 (σ = 0.18). Best model (GPT-4): r = 0.62. Significant inter-task variability: 0.12 ≤ r ≤ 0.83. Conclusion: LLMs not ready to replace human judges in NLP.\"\n",
+ " - 4: Highly concise with minimal extraneous information, efficiently describing methodologies and contributions\n",
+ " Example: \"JUDGE-BENCH: 20 NLP datasets for LLM evaluation. 11 models tested. Mean human-LLM judgment correlation: 0.47. Best model: 0.62. High variability across tasks. LLMs currently inadequate for replacing human NLP judges.\"\n",
+ " - 3: Generally concise but could be slightly more compact in describing research approaches\n",
+ " Example: \"JUDGE-BENCH evaluates 11 LLMs on 20 NLP datasets. Uses correlation with human judgments. Finds variable performance across tasks. Best model achieves 0.62 correlation. Concludes LLMs can't reliably replace human judges in NLP yet.\"\n",
+ " - 2: Contains some unnecessary information or repetition, diluting the focus on key methodologies and contributions\n",
+ " Example: \"The paper introduces JUDGE-BENCH, a new way to evaluate language models. It looks at how well 11 different AI models can match human judgments on 20 NLP tasks. The researchers found that even the best AI model wasn't consistently as good as humans at judging language tasks. They conclude that AI models aren't ready to replace human judges in NLP research yet.\"\n",
+ " - 1: Verbose with significant redundancy, obscuring the main research points\n",
+ " Example: \"In this study, the researchers created something called JUDGE-BENCH. It's a way to test how good AI language models are at understanding and judging language like humans do. They tested 11 different AI models on 20 different types of language tasks. They found out that the AI models weren't as consistent as humans in judging these tasks. Even the best AI model wasn't always as good as humans. So, they say that right now, we can't use AI to replace humans when we need to judge language in research.\"\n",
+ " - 0: Extremely verbose or filled with irrelevant information unrelated to methodologies and contributions\n",
+ " Example: \"The researchers in this study were interested in natural language processing, which is a field of artificial intelligence that deals with how computers understand and generate human language. They created a new tool called JUDGE-BENCH to test AI models. They used many different language tasks and compared how the AI did compared to humans. It's important to test AI models because we want to know if they can understand language as well as humans can. This kind of research helps us improve AI technology.\"\n",
+ "\n",
+ " Examples:\n",
+ "\n",
+ "1. High-quality summary (Instruction: \"Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.\"):\n",
+ "{{\n",
+ " \"relevance\": {{\n",
+ " \"score\": 4.75\n",
+ " }},\n",
+ " \"technical_quality\": {{\n",
+ " \"score\": 4.5\n",
+ " }},\n",
+ " \"conciseness\": {{\n",
+ " \"score\": 4.25\n",
+ " }}\n",
+ "}}\n",
+ "\n",
+ "2. Average-quality summary (Instruction: \"Analyze the experimental setup, results, and limitations of this study.\"):\n",
+ "{{\n",
+ " \"relevance\": {{\n",
+ " \"score\": 3.0\n",
+ " }},\n",
+ " \"technical_quality\": {{\n",
+ " \"score\": 2.75\n",
+ " }},\n",
+ " \"conciseness\": {{\n",
+ " \"score\": 3.5\n",
+ " }}\n",
+ "}}\n",
+ "\n",
+ "3. Low-quality summary (Instruction: \"Explain how this paper's approach compares to existing methods in the field.\"):\n",
+ "{{\n",
+ " \"relevance\": {{\n",
+ " \"score\": 1.5\n",
+ " }},\n",
+ " \"technical_quality\": {{\n",
+ " \"score\": 1.25\n",
+ " }},\n",
+ " \"conciseness\": {{\n",
+ " \"score\": 2.0\n",
+ " }}\n",
+ "}}\n",
+ "\n",
+ "Provide your evaluation in the following JSON format:\n",
+ "{{\n",
+ " \"relevance\": {{\n",
+ " \"score\": \n",
+ " }},\n",
+ " \"technical_quality\": {{\n",
+ " \"score\": \n",
+ " }},\n",
+ " \"conciseness\": {{\n",
+ " \"score\": \n",
+ " }}\n",
+ "}}\n",
+ "\n",
+ "Ensure your response is ONLY valid JSON. Do not include any other text outside the JSON object.\n",
+ "Ensure you have the keys: relevance, technical_quality, conciseness, each containing only a score.\n",
+ "Ensure each score is a float between 0 and 10, using the scoring rules provided above.\n",
+ "\"\"\"\n",
+ "\n",
+ " response = openai_client.chat.completions.create(\n",
+ " model=model,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
+ " response_format={\"type\": \"json_object\"}\n",
+ " )\n",
+ " return json.loads(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {
+ "id": "AHi94h1wdPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def calculate_long_tail_stats(scores):\n",
+ " if not scores:\n",
+ " return None\n",
+ " aspects = ['relevance', 'technical_quality', 'conciseness']\n",
+ " stats = {}\n",
+ " for aspect in aspects:\n",
+ " try:\n",
+ " if isinstance(scores[0], list):\n",
+ " flattened_scores = [score[aspect]['score'] for sublist in scores for score in sublist]\n",
+ " elif isinstance(scores[0], dict):\n",
+ " flattened_scores = [score[aspect]['score'] for score in scores]\n",
+ " else:\n",
+ " print(f\"Unexpected format for scores: {scores}\")\n",
+ " return None\n",
+ "\n",
+ " stats[aspect] = {\n",
+ " \"mean\": np.mean(flattened_scores),\n",
+ " # \"median\": np.median(flattened_scores),\n",
+ " # \"top_5_percent\": np.mean(sorted(flattened_scores)[-max(1, int(len(flattened_scores)*0.05)):]),\n",
+ " # \"bottom_5_percent\": np.mean(sorted(flattened_scores)[:max(1, int(len(flattened_scores)*0.05))]),\n",
+ " # \"top_1_percent\": np.mean(sorted(flattened_scores)[-max(1, int(len(flattened_scores)*0.01)):]),\n",
+ " # \"interquartile_range\": np.percentile(flattened_scores, 75) - np.percentile(flattened_scores, 25),\n",
+ " \"tail_ratio\": np.mean(sorted(flattened_scores)[-max(1, int(len(flattened_scores)*0.05)):]) / np.mean(flattened_scores),\n",
+ " }\n",
+ " except Exception as e:\n",
+ " print(f\"Error calculating stats for {aspect}: {str(e)}\")\n",
+ " stats[aspect] = None\n",
+ " return stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "id": "PRBwiXyCdPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def analyze_iteration_impact(scores):\n",
+ " if len(scores) < 2:\n",
+ " return {aspect: {\"mean_improvement\": 0, \"diminishing_returns_point\": 0, \"cumulative_improvement\": 0, \"improvement_variability\": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}\n",
+ "\n",
+ " aspects = ['relevance', 'technical_quality', 'conciseness']\n",
+ " results = {}\n",
+ "\n",
+ " for aspect in aspects:\n",
+ " aspect_scores = [s[aspect]['score'] for s in scores]\n",
+ " improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]\n",
+ "\n",
+ " results[aspect] = {\n",
+ " # \"mean_improvement\": np.mean(improvements),\n",
+ " \"diminishing_returns_point\": next((i for i, imp in enumerate(improvements) if imp <= 0), len(improvements)),\n",
+ " \"cumulative_improvement\": sum(improvements),\n",
+ " # \"improvement_variability\": np.std(improvements) / np.mean(improvements) if np.mean(improvements) != 0 else 0\n",
+ " }\n",
+ "\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "id": "bAKG5hfBdPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def find_optimal_improvement_range(scores):\n",
+ " if len(scores) < 3:\n",
+ " return {aspect: {\"optimal_range_start\": 0, \"optimal_range_end\": 0, \"score_at_start\": 0, \"score_at_end\": 0, \"improvement_in_range\": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}\n",
+ "\n",
+ " aspects = ['relevance', 'technical_quality', 'conciseness']\n",
+ " results = {}\n",
+ "\n",
+ " for aspect in aspects:\n",
+ " aspect_scores = [s[aspect]['score'] for s in scores]\n",
+ " improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]\n",
+ "\n",
+ " window_size = min(3, len(aspect_scores) - 1)\n",
+ " moving_avg = np.convolve(improvements, np.ones(window_size), 'valid') / window_size\n",
+ "\n",
+ " threshold = 0.1 * np.mean(improvements)\n",
+ " above_threshold = [i for i, avg in enumerate(moving_avg) if avg >= threshold]\n",
+ "\n",
+ " if not above_threshold:\n",
+ " optimal_start, optimal_end = 0, 0\n",
+ " else:\n",
+ " optimal_start = above_threshold[0]\n",
+ " optimal_end = above_threshold[-1] + 1\n",
+ "\n",
+ " results[aspect] = {\n",
+ " \"optimal_range_start\": optimal_start,\n",
+ " \"optimal_range_end\": optimal_end,\n",
+ " \"score_at_start\": aspect_scores[optimal_start],\n",
+ " \"score_at_end\": aspect_scores[optimal_end] if optimal_end < len(aspect_scores) else aspect_scores[-1],\n",
+ " \"improvement_in_range\": sum(improvements[optimal_start:optimal_end])\n",
+ " }\n",
+ "\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "id": "YJ36vbiidPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def find_optimal_score_range(scores):\n",
+ " if len(scores) < 2:\n",
+ " return {aspect: {\"optimal_range_start\": 0, \"optimal_range_end\": 0, \"highest_score\": 0, \"improvement_in_range\": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}\n",
+ "\n",
+ " aspects = ['relevance', 'technical_quality', 'conciseness']\n",
+ " results = {}\n",
+ "\n",
+ " for aspect in aspects:\n",
+ " aspect_scores = [s[aspect]['score'] for s in scores]\n",
+ " improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]\n",
+ "\n",
+ " highest_score = max(aspect_scores)\n",
+ " highest_score_index = aspect_scores.index(highest_score)\n",
+ "\n",
+ " best_start = 0\n",
+ " best_end = highest_score_index\n",
+ " best_improvement = sum(improvements[:highest_score_index])\n",
+ "\n",
+ " for start in range(highest_score_index):\n",
+ " current_improvement = sum(improvements[start:highest_score_index])\n",
+ " if current_improvement > best_improvement:\n",
+ " best_start = start\n",
+ " best_improvement = current_improvement\n",
+ "\n",
+ " results[aspect] = {\n",
+ " \"optimal_range_start\": best_start,\n",
+ " \"optimal_range_end\": highest_score_index,\n",
+ " \"score_at_start\": aspect_scores[best_start],\n",
+ " \"score_at_end\": highest_score,\n",
+ " \"improvement_in_range\": best_improvement\n",
+ " }\n",
+ "\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "id": "ow54Kn7pdPeM"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def process_iteration_summaries(model_output, instruction, model):\n",
+ " iteration_scores = [score_summary(summary, f\"Iteration Summary {i+1}\", instruction, model)\n",
+ " for i, summary in enumerate(model_output[\"iteration_summaries\"])]\n",
+ " return {\n",
+ " \"long_tail_stats\": calculate_long_tail_stats(iteration_scores),\n",
+ " # \"iteration_impact\": analyze_iteration_impact(iteration_scores),\n",
+ " # \"optimal_improvement_range\": find_optimal_improvement_range(iteration_scores),\n",
+ " # \"optimal_score_range\": find_optimal_score_range(iteration_scores)\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "id": "TtBFeqQLdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def quality_scorer(instruction, model_output, model=\"gpt-4o\"):\n",
+ " scores = {\n",
+ " \"iteration_summaries_analysis\": {},\n",
+ " \"accumulated_summary\": {},\n",
+ " \"final_summary\": {}\n",
+ " }\n",
+ "\n",
+ " try:\n",
+ "\n",
+ " # Process iteration summaries\n",
+ " scores[\"iteration_summaries_analysis\"] = process_iteration_summaries(model_output, instruction, model)\n",
+ "\n",
+ " # Score accumulated summary\n",
+ " scores[\"accumulated_summary\"] = score_summary(model_output[\"accumulated_summary\"], \"Accumulated Summary\", instruction, model)\n",
+ "\n",
+ " # Score final summary\n",
+ " scores[\"final_summary\"] = score_summary(model_output[\"final_summary\"], \"Final Summary\", instruction, model)\n",
+ "\n",
+ " # After calculating all scores\n",
+ " flattened_scores = {}\n",
+ " for key, value in scores.items():\n",
+ " if isinstance(value, dict):\n",
+ " flattened_scores[key] = flatten_dict(value)\n",
+ " else:\n",
+ " flattened_scores[key] = value\n",
+ "\n",
+ " scores = flatten_dict(flattened_scores)\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"Error in quality_scorer: {str(e)}\")\n",
+ " scores[\"error\"] = str(e)\n",
+ "\n",
+ " return scores"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fHqaeMT6dPeN"
+ },
+ "source": [
+ "Here's a markdown description for the \"Run Evaluation!\" section:\n",
+ "\n",
+ "## Run Evaluation!\n",
+ "\n",
+ "In this section, we demonstrate how to run the evaluation of our Chain of Density (CoD) summarization pipeline on ArXiv papers. This process involves using multiple models and assessing their performance using our custom evaluation metrics.\n",
+ "\n",
+ "1. First, we define a list of models to evaluate:\n",
+ "These models represent different versions of Claude, allowing us to compare their performance on our summarization task.\n",
+ "\n",
+ "2. Next, we set up and run the evaluation:\n",
+ "\n",
+ "Here's what's happening in this code:\n",
+ "\n",
+ "- We create a `weave.Evaluation` object, using our previously defined dataset and the `quality_scorer` function.\n",
+ "- We iterate through each model in our list.\n",
+ "- For each model, we create an `ArxivChainOfDensityPipeline` instance, specifying the model and setting `density_iterations` to 8.\n",
+ "- We then run the evaluation asynchronously using `await evaluation.evaluate()`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "id": "WNcyrqJxdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "models = [\n",
+ " \"claude-3-opus-20240229\",\n",
+ " \"claude-3-haiku-20240307\",\n",
+ " \"claude-3-5-sonnet-20240620\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "vOXMQrPTdPeN",
+ "outputId": "ea03ee68-41f5-4308-9528-28bf689791f1"
+ },
+ "outputs": [],
+ "source": [
+ "evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])\n",
+ "for model in models:\n",
+ " arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)\n",
+ " await evaluation.evaluate(arxiv_chain_of_density_pipeline)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YxAlLV1fdPeN"
+ },
+ "source": [
+ "## Optional: Deeper Analysis of Summary Refinement using Chunking\n",
+ "\n",
+ "This section introduces an advanced technique to enhance the Chain of Density (CoD) summarization process for ArXiv PDFs. By incorporating a chunking mechanism, we can handle longer documents more effectively and potentially improve the quality of our summaries. Here's an overview of the augmented CoD summarization process:\n",
+ "\n",
+ "### Augmented Chain of Density Summarization\n",
+ "\n",
+ "1. **Chunk the Text**: We split the input text into manageable chunks using the `chunk_text` function. This function intelligently handles document structure, including image descriptions.\n",
+ "\n",
+ "2. **Iterative Chunk Summarization**: For each chunk, we apply the CoD process using `summarize_chunk`. This creates summaries for individual sections of the document.\n",
+ "\n",
+ "3. **Combine Chunk Summaries**: We use `summarize_chunk_summaries` to integrate information from all chunk summaries into a cohesive whole.\n",
+ "\n",
+ "4. **Iterative Refinement**: We repeat steps 2-3 for a specified number of iterations (`chunk_iterations`) to progressively refine the summary.\n",
+ "\n",
+ "5. **Final Density Pass**: After chunk-based summarization, we apply the standard CoD process to further refine and densify the final summary.\n",
+ "\n",
+ "Key components of this process include:\n",
+ "\n",
+ "- `chunk_text`: Splits the document into manageable pieces.\n",
+ "- `summarize_chunk`: Applies CoD to individual chunks.\n",
+ "- `summarize_chunk_summaries`: Combines chunk summaries.\n",
+ "- `summarize_chunk_iteration`: Manages the iteration process for chunk summarization.\n",
+ "- `iterative_chunk_summarization`: Orchestrates the entire chunk-based summarization process.\n",
+ "- `chain_of_density_summarization`: Integrates chunking with the final CoD refinement.\n",
+ "\n",
+ "This augmented approach allows us to:\n",
+ "1. Handle longer documents more effectively.\n",
+ "2. Potentially capture more nuanced information from different parts of the paper.\n",
+ "3. Provide a more comprehensive summary that considers the entire document structure.\n",
+ "\n",
+ "The `ArxivChainOfDensityPipeline` class has been updated to incorporate these new features, allowing for easy experimentation with different chunk sizes and iteration counts.\n",
+ "\n",
+ "To evaluate the effectiveness of this approach, we've also extended our `quality_scorer` function to analyze the chunk-based summaries separately. This gives us insights into how the chunking process affects summary quality at different stages of the pipeline.\n",
+ "\n",
+ "By using this augmented CoD approach, we aim to create more comprehensive and accurate summaries of ArXiv papers, especially for longer or more complex documents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {
+ "id": "1YOyfn4_dPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def chunk_text(text, chunk_size):\n",
+ " chunks = []\n",
+ " current_chunk = \"\"\n",
+ " lines = text.split('\\n')\n",
+ "\n",
+ " i = 0\n",
+ " while i < len(lines):\n",
+ " line = lines[i]\n",
+ " if len(current_chunk) + len(line) > chunk_size:\n",
+ " if current_chunk:\n",
+ " chunks.append(current_chunk.strip())\n",
+ " current_chunk = \"\"\n",
+ "\n",
+ " current_chunk += line + \"\\n\"\n",
+ "\n",
+ " if line.startswith(\"[Image Descriptions for page\"):\n",
+ " if current_chunk.strip():\n",
+ " chunks.append(current_chunk.strip())\n",
+ " current_chunk = \"\"\n",
+ "\n",
+ " image_descriptions = line + \"\\n\"\n",
+ " i += 1\n",
+ " while i < len(lines) and not lines[i].startswith(\"[END OF IMAGE DESCRIPTIONS]\"):\n",
+ " image_descriptions += lines[i] + \"\\n\"\n",
+ " i += 1\n",
+ " if i < len(lines):\n",
+ " image_descriptions += lines[i] + \"\\n\"\n",
+ "\n",
+ " chunks.append(image_descriptions.strip())\n",
+ " current_chunk = \"\"\n",
+ " else:\n",
+ " i += 1\n",
+ "\n",
+ " if current_chunk:\n",
+ " chunks.append(current_chunk.strip())\n",
+ "\n",
+ " combined_chunks = []\n",
+ " current_combined_chunk = \"\"\n",
+ " for chunk in chunks:\n",
+ " if len(current_combined_chunk) + len(chunk) <= chunk_size:\n",
+ " current_combined_chunk += chunk + \"\\n\\n\"\n",
+ " else:\n",
+ " if current_combined_chunk:\n",
+ " combined_chunks.append(current_combined_chunk.strip())\n",
+ " current_combined_chunk = chunk + \"\\n\\n\"\n",
+ "\n",
+ " if current_combined_chunk:\n",
+ " combined_chunks.append(current_combined_chunk.strip())\n",
+ "\n",
+ " return combined_chunks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {
+ "id": "_8lEUtyVdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def summarize_chunk(chunk, instruction, current_summary=\"\", iteration=1, model=\"claude-3-5-sonnet-20240620\"):\n",
+ " prompt = f\"\"\"Current summary:\n",
+ " {current_summary}\n",
+ "\n",
+ " New information:\n",
+ " {chunk}\n",
+ "\n",
+ " Instruction to focus on: {instruction}\n",
+ "\n",
+ " Iteration: {iteration}\n",
+ "\n",
+ " Create an extremely dense, highly technical summary that specifically addresses the given instruction. Follow these steps:\n",
+ "\n",
+ " 1. Identify 3-5 key technical points from the new information that are directly relevant to the instruction, prioritizing:\n",
+ " - Novel methodologies or algorithms related to the instruction\n",
+ " - Specific quantitative results or metrics that address the instruction\n",
+ " - Detailed experimental setups or parameters pertinent to the instruction\n",
+ " - Precise definitions of domain-specific concepts mentioned in the instruction\n",
+ " - Critical limitations or assumptions in the research that affect the instruction\n",
+ "\n",
+ " 2. Integrate these points with the current summary, ensuring:\n",
+ " - Direct relevance to the instruction at hand\n",
+ " - No redundancy or oversimplification\n",
+ " - Preservation of technical nuances and complexities specific to the instruction\n",
+ " - Inclusion of relevant equations, formulas, or mathematical notations that help address the instruction\n",
+ " - Accurate representation of statistical significance and error margins for instruction-related data\n",
+ "\n",
+ " 3. Rephrase the combined information to maximize information density while maintaining focus on the instruction:\n",
+ " - Use domain-specific terminology and jargon without simplification, as relevant to the instruction\n",
+ " - Maintain the level of detail expected in a PhD-level discourse on the specific topic of the instruction\n",
+ " - Incorporate precise citations or references where applicable to support the response\n",
+ " - Preserve any conflicting viewpoints or ongoing debates in the field that relate to the instruction\n",
+ "\n",
+ " 4. With each iteration, aim to increase information density by 30-40% without sacrificing technical accuracy or critical details that address the instruction.\n",
+ "\n",
+ " 5. Ensure the summary includes instruction-specific:\n",
+ " - Methodological details (e.g., exact algorithms, parameter settings) that are crucial to addressing the instruction\n",
+ " - Precise quantitative results with appropriate units and error bounds that directly relate to the instruction\n",
+ " - Detailed descriptions of novel techniques or approaches that are key to addressing the instruction\n",
+ " - Critical analysis of strengths and limitations in the research as they pertain to the instruction\n",
+ "\n",
+ " Produce a summary that is significantly more information-dense and technically precise than the previous one, while remaining laser-focused on addressing the given instruction. Use language appropriate for a highly specialized audience in the field.\"\"\"\n",
+ "\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ " return response.content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "id": "8V-BUYhidPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model=\"claude-3-opus-20240229\"):\n",
+ " return anthropic_client.messages.create(\n",
+ " model=model,\n",
+ " max_tokens=4096,\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": f\"\"\"Given this current summary:\n",
+ "\n",
+ " {current_summary}\n",
+ "\n",
+ " And these chunk summaries:\n",
+ "\n",
+ " {' '.join(chunk_summaries)}\n",
+ "\n",
+ " And this instruction to focus on:\n",
+ "\n",
+ " {instruction}\n",
+ "\n",
+ " Create an extremely dense, final summary that refines the current summary by incorporating key information from the chunk summaries, while specifically addressing the given instruction. Follow these guidelines:\n",
+ "\n",
+ " 1. Integrate the most relevant and important information from the chunk summaries into the current summary.\n",
+ " 2. Ensure all key technical content from both the current summary and chunk summaries that relates to the instruction is retained.\n",
+ " 3. Aim to reduce overall length by 30-40% while increasing information density.\n",
+ " 4. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.\n",
+ " 5. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.\n",
+ " 6. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.\n",
+ " 7. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.\n",
+ " 8. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints from across all summaries.\n",
+ " 9. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.\n",
+ "\n",
+ " The final summary should be a highly concentrated, technical distillation of all provided summaries that specifically addresses the given instruction, suitable for specialists in the field.\"\"\",\n",
+ " }\n",
+ " ],\n",
+ " ).content[0].text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {
+ "id": "nUb8HjWzdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model):\n",
+ " chunk_summaries = []\n",
+ " for i, chunk in enumerate(chunks, 1):\n",
+ " current_summary = summarize_chunk(chunk, instruction, current_summary, iteration, model)\n",
+ " chunk_summaries.append(current_summary)\n",
+ " print(f\"Iteration {iteration}, Chunk {i}:\\n{current_summary}\\n\")\n",
+ " current_summary = summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model)\n",
+ " print(f\"Iteration {iteration}, Final Summary:\\n{current_summary}\\n\")\n",
+ " return current_summary, chunk_summaries\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def iterative_chunk_summarization(chunks, instruction, current_summary, chunk_iterations, model):\n",
+ " chunk_iteration_summaries = []\n",
+ " chunk_summaries = []\n",
+ " for iteration in range(1, chunk_iterations + 1):\n",
+ " current_summary, iteration_chunk_summaries = summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model)\n",
+ " chunk_iteration_summaries.append(current_summary)\n",
+ " chunk_summaries.append(iteration_chunk_summaries)\n",
+ " return current_summary, chunk_iteration_summaries, chunk_summaries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {
+ "id": "h8w8qIiGdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def chain_of_density_summarization(instruction, text, model=\"claude-3-5-sonnet-20240620\", chunk_size=8192, chunk_iterations=2, density_iterations=2):\n",
+ " chunks = chunk_text(text, chunk_size)\n",
+ " print(f\"Number of chunks: {len(chunks)}\")\n",
+ " print(f\"Chunk sizes: {[len(chunk) for chunk in chunks]}\")\n",
+ "\n",
+ " current_summary, chunk_iteration_summaries, chunk_summaries = iterative_chunk_summarization(chunks, instruction, \"\", chunk_iterations, model)\n",
+ " current_summary, iteration_summaries = iterative_density_summarization(instruction, current_summary, density_iterations, model)\n",
+ " final_summary_text = final_summary(instruction, current_summary, model)\n",
+ " print(f\"Final Summary:\\n{final_summary_text}\\n\")\n",
+ "\n",
+ " return {\n",
+ " \"final_summary\": final_summary_text,\n",
+ " \"accumulated_summary\": current_summary,\n",
+ " \"iteration_summaries\": iteration_summaries,\n",
+ " \"chunk_iteration_summaries\": chunk_iteration_summaries,\n",
+ " \"chunk_summaries\": chunk_summaries\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {
+ "id": "KvklW8LFdPeN"
+ },
+ "outputs": [],
+ "source": [
+ "class ArxivChainOfDensityPipeline(weave.Model):\n",
+ "\n",
+ " model: str = \"claude-3-5-sonnet-20240620\"\n",
+ " chunk_size: int = 20000\n",
+ " chunk_iterations: int = 1\n",
+ " density_iterations: int = 3\n",
+ " use_cache: bool = False\n",
+ " cache: dict = {}\n",
+ "\n",
+ " def __init__(self, model: str = \"claude-3-5-sonnet-20240620\", chunk_size: int = 4000, chunk_iterations: int = 1, density_iterations: int = 3, use_cache: bool = False):\n",
+ " super().__init__()\n",
+ " self.model = model\n",
+ " self.chunk_size = chunk_size\n",
+ " self.chunk_iterations = chunk_iterations\n",
+ " self.density_iterations = density_iterations\n",
+ " self.use_cache = use_cache\n",
+ " if use_cache:\n",
+ " self.cache = {}\n",
+ "\n",
+ " @weave.op()\n",
+ " def predict(self, paper: ArxivPaper, instruction: str) -> dict:\n",
+ "\n",
+ " if self.use_cache:\n",
+ " cache_key = (paper.entry_id, instruction)\n",
+ " if cache_key in self.cache:\n",
+ " return self.cache[cache_key]\n",
+ "\n",
+ " extracted_images = extract_images(paper)\n",
+ " cleaned_text = replace_images_with_descriptions(paper, extracted_images)\n",
+ " result = chain_of_density_summarization(instruction, cleaned_text, model=self.model, chunk_size=self.chunk_size, chunk_iterations=self.chunk_iterations, density_iterations=self.density_iterations)\n",
+ "\n",
+ " if self.use_cache:\n",
+ " self.cache[cache_key] = result\n",
+ "\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {
+ "id": "FtvQt4nkdPeO"
+ },
+ "outputs": [],
+ "source": [
+ "def process_chunk_summaries(model_output, instruction, model):\n",
+ " scores = {}\n",
+ " for i, chunk_list in enumerate(model_output[\"chunk_summaries\"]):\n",
+ " chunk_summary_scores = []\n",
+ " for j, summary in enumerate(chunk_list):\n",
+ " chunk_summary_score = score_summary(summary, f\"Chunk Summary {i+1}.{j+1}\", instruction, model)\n",
+ " chunk_summary_scores.append(chunk_summary_score)\n",
+ " scores[f\"chunk_summaries_analysis_{i+1}\"] = {\n",
+ " \"long_tail_stats\": calculate_long_tail_stats(chunk_summary_scores),\n",
+ " \"iteration_impact\": analyze_iteration_impact(chunk_summary_scores),\n",
+ " \"optimal_improvement_range\": find_optimal_improvement_range(chunk_summary_scores),\n",
+ " \"optimal_score_range\": find_optimal_score_range(chunk_summary_scores)\n",
+ " }\n",
+ " return scores\n",
+ "\n",
+ "\n",
+ "def process_chunk_iteration_summaries(model_output, instruction, model):\n",
+ " chunk_iteration_scores = [score_summary(summary, f\"Chunk Iteration Summary {i+1}\", instruction, model)\n",
+ " for i, summary in enumerate(model_output[\"chunk_iteration_summaries\"])]\n",
+ " return {\n",
+ " \"long_tail_stats\": calculate_long_tail_stats(chunk_iteration_scores),\n",
+ " # \"iteration_impact\": analyze_iteration_impact(chunk_iteration_scores),\n",
+ " # \"optimal_improvement_range\": find_optimal_improvement_range(chunk_iteration_scores),\n",
+ " # \"optimal_score_range\": find_optimal_score_range(chunk_iteration_scores)\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {
+ "id": "oKU5GpSBdPeO"
+ },
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def quality_scorer(instruction, model_output, model=\"gpt-4o\"):\n",
+ " scores = {\n",
+ " \"chunk_summaries_analysis\": {},\n",
+ " \"chunk_iteration_summaries_analysis\": {},\n",
+ " \"iteration_summaries_analysis\": {},\n",
+ " \"accumulated_summary\": {},\n",
+ " \"final_summary\": {}\n",
+ " }\n",
+ "\n",
+ " try:\n",
+ " # Process chunk summaries\n",
+ " chunk_summaries_scores = process_chunk_summaries(model_output, instruction, model)\n",
+ " scores.update(chunk_summaries_scores)\n",
+ "\n",
+ " # Process chunk iteration summaries\n",
+ " scores[\"chunk_iteration_summaries_analysis\"] = process_chunk_iteration_summaries(model_output, instruction, model)\n",
+ "\n",
+ " # Process iteration summaries\n",
+ " scores[\"iteration_summaries_analysis\"] = process_iteration_summaries(model_output, instruction, model)\n",
+ "\n",
+ " # Score accumulated summary\n",
+ " scores[\"accumulated_summary\"] = score_summary(model_output[\"accumulated_summary\"], \"Accumulated Summary\", instruction, model)\n",
+ "\n",
+ " # Score final summary\n",
+ " scores[\"final_summary\"] = score_summary(model_output[\"final_summary\"], \"Final Summary\", instruction, model)\n",
+ "\n",
+ " # After calculating all scores\n",
+ " flattened_scores = {}\n",
+ " for key, value in scores.items():\n",
+ " if isinstance(value, dict):\n",
+ " flattened_scores[key] = flatten_dict(value)\n",
+ " else:\n",
+ " flattened_scores[key] = value\n",
+ "\n",
+ " scores = flatten_dict(flattened_scores)\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"Error in quality_scorer: {str(e)}\")\n",
+ " scores[\"error\"] = str(e)\n",
+ "\n",
+ " return scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {
+ "id": "9Cm3Een1dPeO"
+ },
+ "outputs": [],
+ "source": [
+ "# evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])\n",
+ "# for model in models:\n",
+ "# arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=1, chunk_iterations=1)\n",
+ "# await evaluation.evaluate(arxiv_chain_of_density_pipeline)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/colabs/anthropic/summarization/extras/animation.py b/colabs/anthropic/summarization/extras/animation.py
new file mode 100644
index 00000000..7de0d89b
--- /dev/null
+++ b/colabs/anthropic/summarization/extras/animation.py
@@ -0,0 +1,218 @@
+from manim import (
+ Text,
+ VGroup,
+ SurroundingRectangle,
+ Arrow,
+ Create,
+ Write,
+ FadeOut,
+ Transform,
+ Scene,
+ RIGHT,
+ UP,
+ DOWN,
+ LEFT,
+ WHITE,
+ GREEN,
+ YELLOW,
+ RED,
+)
+import numpy as np
+
+
+class ChunkingAnalysisAnimation(Scene):
+ def construct(self):
+ # Scene 1: Introduction
+ title = Text(
+ "Optimizing Chunks in Chain-of-Density Summarization", font_size=40
+ )
+ self.play(Write(title))
+ self.wait(2)
+ self.play(FadeOut(title))
+
+ # Scene 2: Visualizing Scores Over Iterations
+ score_group, score_boxes = self.show_score_progression()
+
+ # Scene 3: Analyzing Optimal Improvement Range
+ self.show_optimal_improvement_range(score_group)
+
+ # Scene 4: Finding Optimal Score Range
+ self.show_optimal_score_range(score_group, score_boxes)
+
+ # Scene 5: Conclusion
+ self.show_conclusion()
+
+ def show_score_progression(self):
+ scores = [3.0, 2.5, 4.0, 3.8, 5.5, 4.2, 6.8, 5.1, 7.0, 6.3, 8.3, 7.5]
+ score_texts = [Text(f"{score:.1f}", font_size=24) for score in scores]
+ score_group = VGroup(*score_texts).arrange(RIGHT, buff=0.7).to_edge(UP, buff=1)
+
+ # Create boxes around each score
+ score_boxes = VGroup(
+ *[SurroundingRectangle(text, color=WHITE) for text in score_texts]
+ )
+
+ self.play(Write(score_group), Create(score_boxes))
+ self.wait(2)
+
+ explanation = Text("Scores over chunks", font_size=24)
+ explanation.next_to(score_group, DOWN, buff=0.5)
+ self.play(Write(explanation))
+ self.wait(2)
+ self.play(FadeOut(explanation))
+
+ return score_group, score_boxes
+
+ def show_optimal_improvement_range(self, score_group):
+ scores = [3.0, 2.5, 4.0, 3.8, 5.5, 4.2, 6.8, 5.1, 7.0, 6.3, 8.3, 7.5]
+ improvements = [scores[i + 1] - scores[i] for i in range(len(scores) - 1)]
+ improvement_texts = [Text(f"{imp:+.1f}", font_size=24) for imp in improvements]
+ improvement_group = (
+ VGroup(*improvement_texts)
+ .arrange(RIGHT, buff=0.7)
+ .next_to(score_group, DOWN, buff=1.5)
+ )
+
+ # Create boxes around each improvement
+ improvement_boxes = VGroup(
+ *[SurroundingRectangle(text, color=WHITE) for text in improvement_texts]
+ )
+
+ self.play(Write(improvement_group), Create(improvement_boxes))
+ self.wait(2)
+
+ # Calculate moving average of improvements
+ window_size = 3
+ moving_avg = [
+ np.mean(improvements[i : i + window_size])
+ for i in range(len(improvements) - window_size + 1)
+ ]
+ threshold = 0.5 * np.mean(improvements) # 50% of mean improvement
+
+ # Find the optimal range
+ optimal_start = next(
+ (i for i, avg in enumerate(moving_avg) if avg >= threshold), 0
+ )
+ optimal_end = next(
+ (
+ i
+ for i in range(len(moving_avg) - 1, -1, -1)
+ if moving_avg[i] >= threshold
+ ),
+ len(moving_avg) - 1,
+ )
+
+ optimal_range_box = SurroundingRectangle(
+ improvement_group[optimal_start : optimal_end + 1], color=GREEN
+ )
+ self.play(Create(optimal_range_box))
+
+ arrow = Arrow(
+ start=optimal_range_box.get_bottom(),
+ end=optimal_range_box.get_bottom() + DOWN,
+ color=GREEN,
+ )
+ self.play(Create(arrow))
+
+ explanation = Text(
+ f"Optimal improvement range: chunks {optimal_start}-{optimal_end+2}",
+ font_size=24,
+ )
+ explanation.next_to(arrow, DOWN, buff=0.5)
+ self.play(Write(explanation))
+ self.wait(2)
+ self.play(
+ FadeOut(improvement_group),
+ FadeOut(improvement_boxes),
+ FadeOut(optimal_range_box),
+ FadeOut(arrow),
+ FadeOut(explanation),
+ )
+
+ def show_optimal_score_range(self, score_group, score_boxes):
+ scores = [3.0, 2.5, 4.0, 3.8, 5.5, 4.2, 6.8, 5.1, 7.0, 6.3, 8.3, 7.5]
+ highest_score = max(scores)
+ highest_score_index = scores.index(highest_score)
+
+ # Highlight the highest score
+ highest_score_box = score_boxes[highest_score_index].copy().set_color(YELLOW)
+ self.play(
+ score_group[highest_score_index].animate.set_color(YELLOW),
+ Transform(score_boxes[highest_score_index], highest_score_box),
+ )
+ self.wait(1)
+
+ # Find the range with the highest cumulative improvement leading to the highest score
+ best_start = 0
+ best_end = highest_score_index
+ best_improvement = sum(
+ scores[i + 1] - scores[i] for i in range(highest_score_index)
+ )
+ for start in range(highest_score_index):
+ current_improvement = sum(
+ scores[i + 1] - scores[i] for i in range(start, highest_score_index)
+ )
+ if current_improvement > best_improvement:
+ best_start = start
+ best_improvement = current_improvement
+
+ optimal_score_box = SurroundingRectangle(
+ score_group[best_start : best_end + 1], color=RED
+ )
+ self.play(Create(optimal_score_box))
+
+ arrow = Arrow(
+ start=optimal_score_box.get_bottom(),
+ end=optimal_score_box.get_bottom() + DOWN,
+ color=RED,
+ )
+ self.play(Create(arrow))
+
+ explanation = Text(
+ f"Optimal score range: chunks {best_start}-{best_end}", font_size=24
+ )
+ explanation.next_to(arrow, DOWN, buff=0.5)
+ self.play(Write(explanation))
+ self.wait(2)
+ self.play(
+ FadeOut(score_group),
+ FadeOut(score_boxes),
+ FadeOut(optimal_score_box),
+ FadeOut(arrow),
+ FadeOut(explanation),
+ )
+
+ def show_conclusion(self):
+ conclusion = (
+ VGroup(
+ Text("Optimal Chunking Strategy 🎯", font_size=36),
+ Text(
+ "1. 📈 Identify the 'sweet spot' with highest improvement rates",
+ font_size=24,
+ ),
+ Text(
+ "2. 🔍 Prioritize chunks in both optimal improvement and high-score ranges",
+ font_size=24,
+ ),
+ Text(
+ "3. 🔬 Expand chunk sizes in low-improvement and peak-score areas",
+ font_size=24,
+ ),
+ Text(
+ "4. ⚖️ Balance rapid early gains with targeted later refinements",
+ font_size=24,
+ ),
+ Text(
+ "5. 🔄 Iteratively reassess ranges throughout the summarization process",
+ font_size=24,
+ ),
+ )
+ .arrange(DOWN, aligned_edge=LEFT, buff=0.5)
+ .to_edge(LEFT, buff=1)
+ .shift(UP * 0.5)
+ )
+
+ self.play(Write(conclusion))
+ self.wait(3)
+ self.play(FadeOut(conclusion))
diff --git a/colabs/anthropic/summarization/index.md b/colabs/anthropic/summarization/index.md
new file mode 100644
index 00000000..2c38c159
--- /dev/null
+++ b/colabs/anthropic/summarization/index.md
@@ -0,0 +1,1348 @@
+# Arxiv PDF Summarization Bot using Chain of Density
+
+[](https://colab.research.google.com/github/wandb/weave/blob/add-summarization-example/examples/cookbooks/summarization/chain_of_density_arxiv.ipynb)
+
+
+This cookbook walks through the implementation of an AI-powered summarization bot that extracts concise, information-dense summaries from Arxiv papers using the Chain of Density technique. We'll use Anthropic's Claude API, the Arxiv API, PyPDF2 for PDF processing, and Weave for experiment tracking and evaluation.
+
+## Setup and Imports
+
+First, let's set up our environment and import the necessary libraries we'll need for our project, including:
+- `anthropic` for interacting with Claude API
+- `arxiv` for fetching paper metadata and PDFs
+- `PyPDF2` for PDF text extraction
+- `weave` for experiment tracking and evaluation
+
+## Initializing Weave and Anthropic Client
+
+Next, we initialize Weave for experiment tracking and set up the Anthropic client:
+
+```python
+weave.init("arxiv-chain-of-density-summarization")
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+```
+
+This sets up Weave with a specific project name and initializes the Anthropic client using an API key stored in the environment variables.
+
+## Optional: Fetching Arxiv Papers
+
+
+Click to Expand
+
+We implement functions to fetch relevant papers from the Arxiv database:
+
+
+
+```python
+@weave.op()
+def generate_arxiv_query_args(instruction, model="claude-3-sonnet-20240229"):
+ # Define the tools available to the LLM
+ tools = [{
+ "name": "prepare_arxiv_search",
+ "description": "Prepare arguments for ArXiv paper search. This tool generates an optimal query string utilizing Boolean operators, field-specific syntax, and precise search terms. It also determines an efficient maximum number of results to fetch, balancing comprehensive coverage with processing efficiency. The output is tailored to the given research instruction, aiming to provide relevant and focused search results.",
+ "input_schema": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "The ArXiv search query string. Supports Boolean operators (AND, OR, NOT), field-specific syntax (e.g., 'ti:' for title, 'au:' for author), quotation marks for exact phrases, and wildcards. Can include multiple search terms to refine results based on title, abstract, authors, comments, journal reference, subject category, or report number."
+ },
+ "max_results": {
+ "type": "integer",
+ "description": "The maximum number of paper results to return from the ArXiv search. Aims to minimize the number of results while ensuring sufficient coverage of the topic. Defaults to 5 if not specified. Increasing this value broadens the search but may increase processing time and resource usage. Aim to be below 10 articles."
+ }
+ },
+ "required": ["query", "max_results"]
+ }
+ }]
+
+ # Define the system prompt for the LLM
+ system_prompt = """You are an expert at generating ArXiv queries. Use the prepare_arxiv_search tool to create an optimal query and determine the appropriate maximum number of results for the given research question. The query should utilize advanced search techniques including Boolean operators, field-specific syntax, and precise terms to ensure comprehensive yet focused results."""
+
+ # Create the user message with the instruction
+ messages = [
+ {
+ "role": "user",
+ "content": f"Use the prepare_arxiv_search tool to generate an optimal ArXiv query and determine the maximum number of results for the following research instruction: {instruction}"
+ }
+ ]
+
+ # Make the API call to the LLM
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=messages,
+ system=system_prompt,
+ tools=tools
+ )
+
+ # Extract the query and max_results from the response
+ for content in response.content:
+ if content.type == 'tool_use' and content.name == 'prepare_arxiv_search':
+ args = content.input
+ return args.get('query'), args.get('max_results')
+
+ # If no tool use was found, return a default query and the provided max_results
+ return f"{instruction}", 5
+```
+
+
+
+```python
+@weave.op()
+def fetch_arxiv_papers(query, max_results=5):
+ # Initialize the arxiv Client
+ arxiv_client = arxiv.Client()
+
+ # Create the search object with the provided query and max_results
+ search = arxiv.Search(
+ query=query,
+ max_results=max_results,
+ sort_by=arxiv.SortCriterion.Relevance,
+ sort_order=arxiv.SortOrder.Descending
+ )
+
+ # Fetch the results using client.results() and convert them to ArxivPaper objects
+ papers = []
+ for result in arxiv_client.results(search):
+ # Convert the raw arxiv result to our custom ArxivPaper object
+ paper = convert_raw_arxiv_to_pydantic(result)
+ papers.append(paper)
+
+ return papers
+```
+
+These functions use Claude to generate an optimal Arxiv search query based on a given instruction and then fetch the relevant papers using the Arxiv API.
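+
+A quick usage sketch tying the two together (the instruction string here is purely illustrative):
+
+```python
+# Hypothetical research instruction used for illustration
+instruction = "Find recent papers on retrieval-augmented generation benchmarks"
+
+query, max_results = generate_arxiv_query_args(instruction)
+papers = fetch_arxiv_papers(query, max_results=max_results)
+
+for paper in papers:
+    print(paper.title)
+```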
+
+
+
+## Creating Sample Arxiv Paper Objects
+
+For demonstration purposes, we create a sample `ArxivPaper` object:
+
+
+
+```python
+arxiv_paper = ArxivPaper(
+ entry_id="http://arxiv.org/abs/2406.04744v1",
+ updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ title="CRAG -- Comprehensive RAG Benchmark",
+ authors=[Author(full_name="Xiao Yang")],
+ summary="CRAG: A benchmark for Retrieval-Augmented Generation (RAG) with 4,409 QA pairs across diverse domains.",
+ doi="10.48550/arXiv.2406.04744",
+ primary_category="cs.CL",
+ pdf_url="https://arxiv.org/pdf/2406.04744"
+)
+```
+
+This creates an `ArxivPaper` object with metadata about a specific paper, including its title, authors, summary, and PDF URL. The most important part of this object is the `pdf_url` field, which contains the location of the PDF file.
+
+## PDF Processing
+
+We implement functions to load and process PDFs:
+
+```python
+def load_pdf(arxiv_result):
+ pdf_url = arxiv_result["pdf_url"]
+ response = requests.get(pdf_url)
+ pdf_file = io.BytesIO(response.content)
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
+ return pdf_reader
+```
+
+This function downloads the PDF from the paper's `pdf_url` and loads it into a PyPDF2 `PdfReader`, which makes the subsequent text and image extraction steps straightforward.
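+
+A quick usage sketch (using the sample `arxiv_paper` defined above; we wrap its `pdf_url` in a plain dict because `load_pdf` indexes its argument like one):
+
+```python
+# Load the PDF for the sample paper and preview the first page's text
+pdf_reader = load_pdf({"pdf_url": arxiv_paper.pdf_url})
+first_page_text = pdf_reader.pages[0].extract_text()
+print(first_page_text[:500])
+```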
+
+## Converting PDF Images to Text
+
+One of the key challenges in processing academic papers is handling the visual content, which often includes both raster images and vector graphics. These visuals can contain crucial information that needs to be incorporated into our summarization process. To address this, we leverage Claude 3.5 Sonnet's advanced vision capabilities to convert these images into detailed textual descriptions.
+
+Here's the implementation of our main image processing function:
+
+
+
+```python
+import base64
+import io
+
+import filetype
+from pdf2image import convert_from_bytes
+from PIL import Image
+
+@weave.op()
+def extract_images(paper, model="claude-3-5-sonnet-20240620"):
+ pdf_reader = load_pdf(paper)
+ all_images = []
+
+ for page_num, page in enumerate(pdf_reader.pages):
+ images = []
+
+ # Process raster images
+ for image in page.images:
+ img_data = image.data
+ kind = filetype.guess(img_data)
+ if kind is None:
+ print(f"Cannot guess file type for image on page {page_num + 1}")
+ continue
+
+ img_str = base64.b64encode(img_data).decode("utf-8")
+ data_url = f"data:{kind.mime};base64,{img_str}"
+ try:
+ images.append(
+ {"image": data_url, "description": process_figure_image(data_url, model=model)}
+ )
+ except Exception as e:
+ print(f"Error processing image on page {page_num + 1}: {e}")
+ images.append({"image": data_url, "description": ""})
+
+ # Process vector graphics
+ vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
+ if vector_graphics_image_data_url:
+ images.append({
+ "image": vector_graphics_image_data_url,
+ "description": process_vector_image_pdf(vector_graphics_image_data_url, model=model)
+ })
+
+ all_images.append(images)
+
+ return all_images
+```
+
+Let's break down the key components and challenges:
+
+### 1. Handling Raster Images
+
+Raster images are typically stored as embedded objects within the PDF. We extract these using PyPDF2's built-in functionality:
+
+```python
+for image in page.images:
+ img_data = image.data
+ # ... process the image data
+```
+
+The challenge here is that these images can be in various formats (PNG, JPEG, etc.). We use the `filetype` library to guess the MIME type, which is crucial for creating a valid data URL:
+
+```python
+kind = filetype.guess(img_data)
+if kind is None:
+ print(f"Cannot guess file type for image on page {page_num + 1}")
+ continue
+
+img_str = base64.b64encode(img_data).decode("utf-8")
+data_url = f"data:{kind.mime};base64,{img_str}"
+```
+
+### 2. Handling Vector Graphics
+
+Vector graphics present a unique challenge because they're not stored as traditional image files within the PDF. Instead, they're often part of the page's content stream. To handle these, we need to convert the entire page to an image:
+
+```python
+vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
+if vector_graphics_image_data_url:
+ images.append({
+ "image": vector_graphics_image_data_url,
+ "description": process_vector_image_pdf(vector_graphics_image_data_url, model=model)
+ })
+```
+
+The `convert_vector_graphic_page_to_image` function (collapsed below) uses `pdf2image` to convert the PDF page to a PNG image. This ensures we capture all vector graphics, but it also means we might capture text and other elements on the page.
+
+
+Click to Expand
+
+```python
+def convert_vector_graphic_page_to_image(pdf_page, scale_factor=0.5):
+ # Helper function to handle indirect PDF objects
+ def get_object(obj):
+ if isinstance(obj, PyPDF2.generic.IndirectObject):
+ return obj.get_object()
+ return obj
+
+ # Extract resources from the PDF page
+ resources = get_object(pdf_page.get('/Resources', {}))
+ xobject = get_object(resources.get('/XObject', {}))
+
+ # Check if there's a figure that's not a raster image (i.e., a vector graphic)
+ if xobject:
+ for obj in xobject.values():
+ obj = get_object(obj)
+ # Check if the object is a Form XObject, which typically represents vector graphics
+ if isinstance(obj, dict) and obj.get('/Subtype') == '/Form':
+ # Convert the page to a temporary PDF file in memory
+ pdf_bytes = io.BytesIO()
+ pdf_writer = PyPDF2.PdfWriter()
+ pdf_writer.add_page(pdf_page)
+ pdf_writer.write(pdf_bytes)
+ pdf_bytes.seek(0)
+
+ # Use pdf2image to convert the PDF to a PNG image
+ images = convert_from_bytes(pdf_bytes.getvalue(), fmt='png')
+
+ if images:
+ image = images[0]
+ # Resize the image to reduce memory usage and processing time
+ new_size = (int(image.width * scale_factor), int(image.height * scale_factor))
+ image = image.resize(new_size, Image.LANCZOS)
+
+ # Convert the image to a base64-encoded string
+ img_byte_arr = io.BytesIO()
+ image.save(img_byte_arr, format='PNG')
+ img_byte_arr = img_byte_arr.getvalue()
+ img_str = base64.b64encode(img_byte_arr).decode("utf-8")
+
+ # Return the image as a data URL
+ return f"data:image/png;base64,{img_str}"
+
+ # Return None if no vector graphics were found or conversion was not needed
+ return None
+```
+
+This approach ensures that all vector graphics on the page are captured, even if they can't be directly extracted as separate objects. However, it's important to note that this method will also capture all other content on the page, which may require additional processing or filtering in subsequent steps of the analysis pipeline.
+
+
+
+
+### 3. Using Claude 3.5 Sonnet for Image Description
+
+The core of our image processing lies in the `process_figure_image` and `process_vector_image_pdf` functions. These functions use Claude 3.5 Sonnet's vision capabilities to generate detailed descriptions of the images:
+
+```python
+@weave.op()
+def process_figure_image(data_url, model="claude-3-5-sonnet-20240620"):
+ img_str = data_url.split(",")[1]
+
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": img_str,
+ },
+ },
+ {
+ "type": "text",
+ "text": """Analyze this image as if it's a figure from a scientific research paper. Provide a detailed technical description addressing the following:
+
+1. Type of figure (e.g., graph, diagram, flowchart, experimental setup)
+2. Key components or variables represented
+3. Relationships or trends depicted
+4. Quantitative information (if present)
+5. Methodology or process illustrated (if applicable)
+6. Potential implications or conclusions that can be drawn
+7. Any limitations or assumptions evident in the figure
+
+Focus on technical accuracy and relevance to scientific research. Avoid general descriptions and concentrate on the specific scientific content presented.""",
+ },
+ ],
+ }
+ ],
+ )
+ return response.content[0].text
+
+@weave.op()
+def process_vector_image_pdf(data_url, model="claude-3-5-sonnet-20240620"):
+ img_str = data_url.split(",")[1]
+
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": img_str,
+ },
+ },
+ {
+ "type": "text",
+ "text": """This image is a full page from a scientific paper PDF, converted to PNG format. It may contain one or more vector graphic figures or charts. Your task is to:
+
+1. Identify and focus solely on the vector graphic figures or charts within the page.
+2. For each identified figure or chart, provide a detailed technical analysis addressing:
+
+ a. Type of figure (e.g., graph, diagram, flowchart)
+ b. Key components or variables represented
+ c. Relationships or trends depicted
+ d. Quantitative information (if present)
+ e. Methodology or process illustrated (if applicable)
+ f. Potential implications or conclusions that can be drawn
+
+3. Ignore any text or other elements on the page that are not part of the vector graphic figures.
+4. If multiple figures are present, analyze each separately and clearly indicate which figure you are describing.
+
+Focus on providing accurate, technical descriptions of the vector graphic content only.""",
+ },
+ ],
+ }
+ ],
+ )
+ return response.content[0].text
+```
+
+> The prompts for `process_figure_image` and `process_vector_image_pdf` are tailored to handle different scenarios:
+>
+> 1. **Figure Image Prompt:**
+> - Assumes a single, isolated figure
+> - Focuses on detailed analysis of the specific figure
+> - Includes points about limitations and assumptions
+>
+> 2. **Vector Image PDF Prompt:**
+> - Assumes a full page that may contain multiple vector graphics
+> - Instructs to identify and focus only on vector graphic elements
+> - Asks for separate analysis of each figure if multiple are present
+> - Explicitly tells to ignore text and non-vector graphic elements
+>
+> These differences ensure that Claude 3.5 Sonnet can accurately process and describe both individual figures and complex pages with multiple vector graphics.
+
+This approach allows us to handle the nuances of different image types within scientific papers. The figure image prompt is designed for standalone images, while the vector image prompt is tailored for full pages that may contain multiple graphics alongside text and other elements.
+
+### 4. Integrating Image Descriptions into the Text
+
+Finally, we integrate the image descriptions into the text of the paper:
+
+
+
+```python
+@weave.op()
+def replace_images_with_descriptions(paper, images):
+ # ... (previous code)
+ if images[page_num] and len(images[page_num]) > 0:
+ text += f"\n\n[Image Descriptions for page {page_num+1}]\n"
+ for image_num, image in enumerate(images[page_num]):
+ text += f"\n[Image {image_num+1}]: {image['description']}\n"
+ text += "[END OF IMAGE DESCRIPTIONS]\n"
+ # ... (rest of the function)
+```
+
+This approach ensures that the image descriptions are clearly demarcated within the text, making it easier for our summarization pipeline to incorporate this visual information.
+
+By implementing this comprehensive image processing pipeline, we ensure that our Chain of Density summarization process can incorporate crucial information from both textual and visual elements of academic papers. This is particularly important for fields where figures and diagrams play a significant role in conveying research findings.
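+
+For completeness, here is a minimal sketch of how such a function can be written end to end, assuming page text is pulled with PyPDF2's `extract_text()` (the notebook's full implementation may differ in details):
+
+```python
+@weave.op()
+def replace_images_with_descriptions(paper, images):
+    # Minimal sketch: interleave each page's extracted text with the image
+    # descriptions produced by extract_images(). `images` is assumed to be a
+    # per-page list of {"image": ..., "description": ...} dicts.
+    pdf_reader = load_pdf(paper)
+    text = ""
+    for page_num, page in enumerate(pdf_reader.pages):
+        text += page.extract_text() or ""
+        if images[page_num] and len(images[page_num]) > 0:
+            text += f"\n\n[Image Descriptions for page {page_num+1}]\n"
+            for image_num, image in enumerate(images[page_num]):
+                text += f"\n[Image {image_num+1}]: {image['description']}\n"
+            text += "[END OF IMAGE DESCRIPTIONS]\n"
+    return text
+```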
+
+## Chain of Density Summarization
+
+The core of our summarization pipeline is then implemented in the following functions:
+
+
+
+### `summarize_current_summary`:
+ - Forms the foundation of our Chain of Density implementation
+ - Utilizes a carefully crafted prompt to guide the language model
+ - Instructs the model to identify new technical entities
+ - Incorporates new entities into the summary
+ - Increases overall information density while maintaining relevance to the given instruction
+
+```python
+@weave.op()
+def summarize_current_summary(document, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
+ # Define the maximum number of tokens for the model's response
+ max_tokens = 4096
+
+ # Construct the prompt for the LLM
+ prompt = f"""
+ Document:
+ {document}
+
+ Current summary:
+ {current_summary}
+
+ Instruction to focus on: {instruction}
+
+ Iteration: {iteration}
+
+ Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction using the below approach:
+
+ 1. Carefully read the current summary and the instruction.
+
+ 2. Identify 1-3 new, important technical entities or ideas from the original text that:
+ - Are directly relevant to the instruction
+ - Are not yet present in the current summary
+ - Add significant, specific information to the summary
+ - Are preferably 5 words or fewer
+ - May include methodologies, algorithms, metrics, or key findings
+       - List these new entities or ideas in the output before the summary
+
+ 3. Write a new summary that:
+ - Incorporates the newly identified entities/ideas
+ - Retains all crucial information from the current summary
+ - Increases overall information density
+ - Remains focused on addressing the instruction
+ - Utilizes the response window of {max_tokens} tokens
+
+ Guidelines:
+ - Prioritize technical accuracy and specificity over general readability
+ - Use precise terminology, domain-specific jargon, and include quantitative details where relevant
+ - Ensure all information is directly related to the instruction
+ - Make every word count: rewrite to improve density and make space for new technical entities
+ - Employ fusion, compression, and removal of less informative phrases to increase density
+ - Never drop entities or technical details from the current summary that are relevant to the instruction
+ - Maintain coherence while maximizing information density
+
+ Your goal is to create a summary that is noticeably denser, more technical, and more informative than the previous one, utilizing the response window of {max_tokens} tokens while staying laser-focused on the instruction. The summary should be suitable for an expert audience in the field."""
+
+ # Make the API call to the LLM
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=max_tokens,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ # Return the generated summary
+ return response.content[0].text
+```
+
+### `iterative_density_summarization`:
+ - Orchestrates the iterative refinement process
+ - Repeatedly calls `summarize_current_summary`
+ - Uses each iteration's output as input for the next
+ - Allows for gradual accumulation of technical details
+ - Increases density of information progressively
+
+```python
+@weave.op()
+def iterative_density_summarization(document, instruction, current_summary, density_iterations, model):
+ # Initialize a list to store summaries from each iteration
+ iteration_summaries = []
+
+ # Iterate through the specified number of density iterations
+ for iteration in range(1, density_iterations + 1):
+ # Generate a new summary based on the current summary and document
+ current_summary = summarize_current_summary(document, instruction, current_summary, iteration, model)
+
+ # Add the new summary to the list of iteration summaries
+ iteration_summaries.append(current_summary)
+
+ # Print the current iteration and summary for monitoring
+ print(f"Iteration {iteration}:\n{current_summary}\n")
+
+ # Return the final summary and the list of all iteration summaries
+ return current_summary, iteration_summaries
+```
+
+### `final_summary`:
+ - Performs a final condensation step after the iterative process
+ - Aims to reduce summary length by 30-40%
+ - Retains all critical technical content
+ - Optimizes for maximum information density and relevance to the instruction
+
+```python
+@weave.op()
+def final_summary(instruction, current_summary, model):
+ # Construct the prompt for the final summary generation
+ prompt = f"""Given this summary:
+
+{current_summary}
+
+And this instruction to focus on:
+
+{instruction}
+
+Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction. Follow these guidelines:
+
+1. Aim to reduce length by 30-40% while retaining all critical technical content relevant to the instruction.
+2. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
+3. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
+4. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
+5. Ensure that all key entities and concepts from the original summary that relate to the instruction are represented.
+6. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
+7. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints.
+8. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.
+
+The final summary should be a highly concentrated, technical distillation of the research that specifically addresses the given instruction, suitable for specialists in the field."""
+
+ # Make the API call to the LLM for the final summary
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ # Return the generated final summary
+ return response.content[0].text
+```
+
+### `chain_of_density_summarization`:
+ - Serves as the main entry point for the summarization process
+ - Coordinates the entire summarization pipeline
+ - Initiates the iterative summarization
+ - Applies the final condensation
+ - Returns a comprehensive result set including:
+ - Final summary
+ - Accumulated summary
+ - All intermediate summaries
+
+```python
+@weave.op()
+def chain_of_density_summarization(document, instruction, current_summary="", model="claude-3-5-sonnet-20240620", density_iterations=2):
+ # Perform iterative density summarization
+ current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)
+
+ # Generate the final, highly condensed summary
+ final_summary_text = final_summary(instruction, current_summary, model)
+
+ # Print the final summary for monitoring
+ print(f"Final Summary:\n{final_summary_text}\n")
+
+ # Return a dictionary containing all generated summaries
+ return {
+ "final_summary": final_summary_text,
+ "accumulated_summary": current_summary,
+ "iteration_summaries": iteration_summaries,
+ }
+```
+
+This implementation leverages the Chain of Density technique to produce increasingly dense and informative summaries. By iteratively refining the summary and focusing on technical entities and ideas, it generates concise yet highly informative summaries tailored to specific instructions. The process prioritizes technical accuracy, domain-specific terminology, and quantitative details, making it particularly suitable for summarizing complex scientific documents for expert audiences.
+
+## Weave Model Object
+
+We create a Weave Model object to encapsulate our summarization pipeline:
+
+
+
+```python
+class ArxivChainOfDensityPipeline(weave.Model):
+ model: str = "claude-3-5-sonnet-20240620"
+ density_iterations: int = 3
+
+ def __init__(self, model: str = "claude-3-5-sonnet-20240620", density_iterations: int = 3):
+ super().__init__()
+ self.model = model
+ self.density_iterations = density_iterations
+
+ @weave.op()
+ def predict(self, paper: ArxivPaper, instruction: str) -> dict:
+ extracted_images = extract_images(paper)
+ cleaned_text = replace_images_with_descriptions(paper, extracted_images)
+ result = chain_of_density_summarization(cleaned_text, instruction, model=self.model, density_iterations=self.density_iterations)
+ return result
+```
+
+This class encapsulates our summarization pipeline as a Weave Model. By inheriting from `weave.Model` and using the `@weave.op()` decorator, we enable automatic versioning and tracking of inputs, outputs, and code changes. This makes it easy to reproduce experiments and compare results across different model versions or parameter settings.
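+
+As a quick smoke test, the pipeline can be called directly before running a full evaluation (a sketch; note that `load_pdf` accesses the paper with dict-style indexing, so pass a dict-like object if your `ArxivPaper` class doesn't support item access):
+
+```python
+pipeline = ArxivChainOfDensityPipeline(model="claude-3-5-sonnet-20240620", density_iterations=2)
+result = pipeline.predict(
+    arxiv_paper,  # or a dict such as {"pdf_url": arxiv_paper.pdf_url, ...} if needed
+    "Summarize the key methodologies and novel contributions of this paper.",
+)
+print(result["final_summary"])
+```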
+
+## Evaluation Dataset
+
+We create an evaluation dataset using sample Arxiv papers and instructions:
+
+
+
+```python
+eval_papers = [arxiv_paper]
+eval_instructions = [
+ "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.",
+]
+
+eval_data = list(product(eval_papers, eval_instructions))
+dataset = weave.Dataset(name="we-paper-reading-eval-data", rows=[{"paper": arxiv_paper, "instruction": instruction, "summary": arxiv_paper.summary} for arxiv_paper, instruction in eval_data])
+weave.publish(dataset)
+```
+
+This creates a Weave Dataset object that combines papers, instructions, and original summaries for evaluation. The `weave.Dataset` class allows us to version and track our evaluation data, ensuring reproducibility of our experiments. By publishing the dataset with `weave.publish()`, we make it available for future use and comparison.
+
+## Evaluation Metrics
+
+We implement several evaluation metrics to assess the quality of our summaries:
+
+
+
+```python
+import json
+import os
+
+from openai import OpenAI
+
+@weave.op()
+def score_summary(summary, summary_type, instruction, model):
+    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+ # Construct a detailed prompt for the GPT model to evaluate the summary
+ prompt = f"""Evaluate the quality of the following {summary_type} based on how well it addresses the given instruction. Use the scoring rules below to calculate three numerical scores between 0 and 10.
+
+Instruction: {instruction}
+
+{summary_type}:
+{summary}
+
+Scoring Rules:
+1. Relevance (0-10): [Detailed scoring criteria for relevance]
+2. Technical Quality (0-10): [Detailed scoring criteria for technical quality]
+3. Conciseness (0-10): [Detailed scoring criteria for conciseness]
+
+Provide your evaluation in the following JSON format:
+{{
+ "relevance": {{
+ "score":
+ }},
+ "technical_quality": {{
+ "score":
+ }},
+ "conciseness": {{
+ "score":
+ }}
+}}
+
+Ensure your response is ONLY valid JSON. Do not include any other text outside the JSON object.
+Ensure you have the keys: relevance, technical_quality, conciseness, each containing only a score.
+Ensure each score is a float between 0 and 10, using the scoring rules provided above.
+"""
+
+ # Make an API call to the GPT model for evaluation
+ response = openai_client.chat.completions.create(
+ model=model,
+ messages=[{"role": "user", "content": prompt}],
+ response_format={"type": "json_object"}
+ )
+
+ # Parse and return the JSON response
+ return json.loads(response.choices[0].message.content)
+```
+
+This function uses an OpenAI GPT model (`gpt-4o` by default in our scorer) to evaluate individual summaries based on three criteria:
+- Relevance
+- Technical quality
+- Conciseness
+
+Benefits:
+- Captures nuanced aspects of summary quality
+- Provides a holistic assessment of how well the summary addresses the given instruction
+- Evaluates technical accuracy while considering conciseness
+
+---
+
+```python
+@weave.op()
+def calculate_long_tail_stats(scores):
+ if not scores:
+ return None
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ stats = {}
+
+ for aspect in aspects:
+ try:
+ # Handle different input formats (list of lists or list of dicts)
+ if isinstance(scores[0], list):
+ flattened_scores = [score[aspect]['score'] for sublist in scores for score in sublist]
+ elif isinstance(scores[0], dict):
+ flattened_scores = [score[aspect]['score'] for score in scores]
+ else:
+ print(f"Unexpected format for scores: {scores}")
+ return None
+
+ # Calculate statistics for each aspect
+ stats[aspect] = {
+ "mean": np.mean(flattened_scores),
+ "tail_ratio": np.mean(sorted(flattened_scores)[-max(1, int(len(flattened_scores)*0.05)):]) / np.mean(flattened_scores),
+ }
+ except Exception as e:
+ print(f"Error calculating stats for {aspect}: {str(e)}")
+ stats[aspect] = None
+
+ return stats
+```
+
+This function:
+- Analyzes the distribution of scores across multiple summaries
+- Calculates for each aspect (relevance, technical quality, conciseness):
+ - Mean score
+ - "Tail ratio" (average of top 5% scores compared to overall mean)
+
+Usefulness:
+- Helps identify potential outliers or exceptionally high-quality summaries
+- Provides insight into overall performance of the summarization process
+- Highlights areas where the model excels or needs improvement
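+
+A toy example with hand-written (hypothetical) scores illustrates the input shape these statistics expect and the kind of output produced:
+
+```python
+example_scores = [
+    {"relevance": {"score": 6.0}, "technical_quality": {"score": 5.5}, "conciseness": {"score": 7.0}},
+    {"relevance": {"score": 7.5}, "technical_quality": {"score": 6.0}, "conciseness": {"score": 7.2}},
+    {"relevance": {"score": 8.0}, "technical_quality": {"score": 6.8}, "conciseness": {"score": 7.5}},
+]
+
+stats = calculate_long_tail_stats(example_scores)
+# e.g. stats["relevance"] -> {"mean": ~7.17, "tail_ratio": ~1.12}
+print(stats)
+```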
+
+---
+
+```python
+@weave.op()
+def analyze_iteration_impact(scores):
+ if len(scores) < 2:
+ return {aspect: {"diminishing_returns_point": 0, "cumulative_improvement": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ results[aspect] = {
+ "diminishing_returns_point": next((i for i, imp in enumerate(improvements) if imp <= 0), len(improvements)),
+ "cumulative_improvement": sum(improvements),
+ }
+
+ return results
+```
+
+
+This function:
+- Assesses the improvement of summaries across iterations
+- Key metrics:
+ - Point of diminishing returns (where improvements become negative or zero)
+ - Cumulative improvement for each aspect
+
+Value:
+- Helps optimize the number of iterations in the Chain of Density process
+- Determines when further iterations may no longer yield significant improvements
+
+---
+
+```python
+@weave.op()
+def find_optimal_improvement_range(scores):
+ if len(scores) < 3:
+ return {aspect: {"optimal_range_start": 0, "optimal_range_end": 0, "score_at_start": 0, "score_at_end": 0, "improvement_in_range": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ # Calculate moving average of improvements
+ window_size = min(3, len(aspect_scores) - 1)
+ moving_avg = np.convolve(improvements, np.ones(window_size), 'valid') / window_size
+
+ # Find range where improvements are above a threshold
+ threshold = 0.1 * np.mean(improvements)
+ above_threshold = [i for i, avg in enumerate(moving_avg) if avg >= threshold]
+
+ if not above_threshold:
+ optimal_start, optimal_end = 0, 0
+ else:
+ optimal_start = above_threshold[0]
+ optimal_end = above_threshold[-1] + 1
+
+ results[aspect] = {
+ "optimal_range_start": optimal_start,
+ "optimal_range_end": optimal_end,
+ "score_at_start": aspect_scores[optimal_start],
+ "score_at_end": aspect_scores[optimal_end] if optimal_end < len(aspect_scores) else aspect_scores[-1],
+ "improvement_in_range": sum(improvements[optimal_start:optimal_end])
+ }
+
+ return results
+```
+
+
+This function:
+- Determines the most effective range of iterations for improvement
+- Methodology:
+ - Uses moving average of improvements to identify sustained progress
+ - Finds optimal range where improvements are above a certain threshold
+
+Benefits:
+- Aids in fine-tuning the Chain of Density process
+- Identifies the most productive iteration range for each aspect of summary quality
+
+---
+
+```python
+@weave.op()
+def find_optimal_score_range(scores):
+ if len(scores) < 2:
+ return {aspect: {"optimal_range_start": 0, "optimal_range_end": 0, "highest_score": 0, "improvement_in_range": 0} for aspect in ['relevance', 'technical_quality', 'conciseness']}
+
+ aspects = ['relevance', 'technical_quality', 'conciseness']
+ results = {}
+
+ for aspect in aspects:
+ aspect_scores = [s[aspect]['score'] for s in scores]
+ improvements = [aspect_scores[i+1] - aspect_scores[i] for i in range(len(aspect_scores)-1)]
+
+ highest_score = max(aspect_scores)
+ highest_score_index = aspect_scores.index(highest_score)
+
+ # Find the best range leading up to the highest score
+ best_start = 0
+ best_end = highest_score_index
+ best_improvement = sum(improvements[:highest_score_index])
+
+ for start in range(highest_score_index):
+ current_improvement = sum(improvements[start:highest_score_index])
+ if current_improvement > best_improvement:
+ best_start = start
+ best_improvement = current_improvement
+
+ results[aspect] = {
+ "optimal_range_start": best_start,
+ "optimal_range_end": highest_score_index,
+ "score_at_start": aspect_scores[best_start],
+ "score_at_end": highest_score,
+ "improvement_in_range": best_improvement
+ }
+
+ return results
+```
+
+
+This function:
+- Identifies the iteration range producing the highest quality summaries
+- Process:
+ - Finds the range leading up to the highest score for each aspect
+ - Considers cumulative improvement within the range
+
+Usefulness:
+- Helps understand which iterations contribute most significantly to final summary quality
+- Assists in optimizing the summarization process for maximum effectiveness
+
+---
+
+```python
+@weave.op()
+def process_iteration_summaries(model_output, instruction, model):
+ iteration_scores = [score_summary(summary, f"Iteration Summary {i+1}", instruction, model)
+ for i, summary in enumerate(model_output["iteration_summaries"])]
+ return {
+ "long_tail_stats": calculate_long_tail_stats(iteration_scores),
+ # Additional analyses can be added here if needed
+ }
+```
+
+
+This function:
+- Aggregates and analyzes scores across all summarization iterations
+- Provides:
+ - Holistic view of summary quality evolution throughout Chain of Density iterations
+ - Comprehensive analysis of the iterative summarization approach
+
+Value:
+- Helps understand overall effectiveness of the iterative process
+- Identifies trends in quality improvement across iterations
+
+---
+
+```python
+@weave.op()
+def quality_scorer(instruction, model_output, model="gpt-4o"):
+ scores = {
+ "iteration_summaries_analysis": {},
+ "accumulated_summary": {},
+ "final_summary": {}
+ }
+
+ try:
+ # Process iteration summaries
+ scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)
+
+ # Score accumulated summary
+ scores["accumulated_summary"] = score_summary(model_output["accumulated_summary"], "Accumulated Summary", instruction, model)
+
+ # Score final summary
+ scores["final_summary"] = score_summary(model_output["final_summary"], "Final Summary", instruction, model)
+
+ # Flatten the scores dictionary for easier analysis
+ flattened_scores = {}
+ for key, value in scores.items():
+ if isinstance(value, dict):
+ flattened_scores[key] = flatten_dict(value)
+ else:
+ flattened_scores[key] = value
+
+ scores = flatten_dict(flattened_scores)
+
+ except Exception as e:
+ print(f"Error in quality_scorer: {str(e)}")
+ scores["error"] = str(e)
+
+ return scores
+```
+
+
+This function:
+- Serves as the main entry point for evaluating summarization quality
+- Features:
+ - Combines all previous metrics into a comprehensive evaluation
+ - Analyzes iteration summaries, accumulated summary, and final summary
+
+Benefits:
+- Provides a detailed, multi-faceted assessment of the summarization pipeline's performance
+- Offers insights into various aspects of summary quality
+- Evaluates the effectiveness of the Chain of Density process as a whole
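+
+Note that `quality_scorer` relies on a `flatten_dict` helper that isn't shown above. A minimal sketch of what it is assumed to do (collapse nested score dictionaries into dot-separated keys):
+
+```python
+def flatten_dict(d, parent_key="", sep="."):
+    # Recursively collapse nested dictionaries into a single level,
+    # joining keys with `sep`, e.g. {"a": {"b": 1}} -> {"a.b": 1}
+    items = {}
+    for key, value in d.items():
+        new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
+        if isinstance(value, dict):
+            items.update(flatten_dict(value, new_key, sep=sep))
+        else:
+            items[new_key] = value
+    return items
+```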
+
+---
+
+These evaluation metrics collectively provide a robust framework for assessing the quality and effectiveness of our Chain of Density summarization pipeline. By examining multiple aspects of summary quality across different stages of the process, we can gain valuable insights into the strengths and weaknesses of our approach, identify areas for improvement, and optimize the summarization process for maximum effectiveness.
+
+## Running the Evaluation
+
+Finally, we set up and run the evaluation:
+
+
+
+```python
+models = [
+ "claude-3-opus-20240229",
+ "claude-3-haiku-20240307",
+ "claude-3-5-sonnet-20240620"
+]
+
+evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
+for model in models:
+ arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)
+ await evaluation.evaluate(arxiv_chain_of_density_pipeline)
+```
+
+This code sets up a Weave Evaluation object and runs the evaluation for each model in our list.
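+
+The top-level `await` works as-is in a notebook. If you run the pipeline as a plain Python script instead, wrap the loop in an `async` function and drive it with `asyncio.run` (a sketch under that assumption):
+
+```python
+import asyncio
+
+async def run_evaluations():
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
+    for model in models:
+        pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)
+        await evaluation.evaluate(pipeline)
+
+asyncio.run(run_evaluations())
+```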
+
+## Optional: Advanced Chunking Technique
+
+
+Click to Expand
+
+The cookbook also includes an optional section on an advanced chunking technique to handle longer documents more effectively:
+
+### Chunking
+
+1. `chunk_text`: Splits the input text into manageable chunks, handling special cases like image descriptions.
+
+```python
+@weave.op()
+def chunk_text(text, chunk_size):
+ chunks = []
+ current_chunk = ""
+ lines = text.split('\n')
+
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+
+        # Special handling for image descriptions: keep each description block
+        # as its own chunk and avoid duplicating its header/footer lines
+        if line.startswith("[Image Descriptions for page"):
+            if current_chunk.strip():
+                chunks.append(current_chunk.strip())
+                current_chunk = ""
+
+            # Collect all lines of the image description
+            image_descriptions = line + "\n"
+            i += 1
+            while i < len(lines) and not lines[i].startswith("[END OF IMAGE DESCRIPTIONS]"):
+                image_descriptions += lines[i] + "\n"
+                i += 1
+            if i < len(lines):
+                image_descriptions += lines[i] + "\n"
+
+            # Add the entire image description as a separate chunk
+            chunks.append(image_descriptions.strip())
+            i += 1
+            continue
+
+        # If adding this line would exceed the chunk size, start a new chunk
+        if len(current_chunk) + len(line) > chunk_size:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = ""
+
+        current_chunk += line + "\n"
+        i += 1
+
+ # Add any remaining text as the last chunk
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ # Combine smaller chunks to reach the desired chunk size
+ combined_chunks = []
+ current_combined_chunk = ""
+ for chunk in chunks:
+ if len(current_combined_chunk) + len(chunk) <= chunk_size:
+ current_combined_chunk += chunk + "\n\n"
+ else:
+ if current_combined_chunk:
+ combined_chunks.append(current_combined_chunk.strip())
+ current_combined_chunk = chunk + "\n\n"
+
+ if current_combined_chunk:
+ combined_chunks.append(current_combined_chunk.strip())
+
+ return combined_chunks
+```
+
+2. `summarize_chunk`: Summarizes an individual chunk, focusing on the given instruction and incorporating previous summary information.
+
+```python
+@weave.op()
+def summarize_chunk(chunk, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
+ # Construct a prompt for summarizing the chunk
+ prompt = f"""Current summary:
+ {current_summary}
+
+ New information:
+ {chunk}
+
+ Instruction to focus on: {instruction}
+
+ Iteration: {iteration}
+
+ Create an extremely dense, highly technical summary that specifically addresses the given instruction. Follow these steps:
+
+ 1. Identify 3-5 key technical points from the new information that are directly relevant to the instruction, prioritizing:
+ - Novel methodologies or algorithms related to the instruction
+ - Specific quantitative results or metrics that address the instruction
+ - Detailed experimental setups or parameters pertinent to the instruction
+ - Precise definitions of domain-specific concepts mentioned in the instruction
+ - Critical limitations or assumptions in the research that affect the instruction
+
+    2. Integrate these points with the current summary, ensuring:
+ - Direct relevance to the instruction at hand
+ - No redundancy or oversimplification
+ - Preservation of technical nuances and complexities specific to the instruction
+ - Inclusion of relevant equations, formulas, or mathematical notations that help address the instruction
+ - Accurate representation of statistical significance and error margins for instruction-related data
+
+    3. Rephrase the combined information to maximize information density while maintaining focus on the instruction:
+ - Use domain-specific terminology and jargon without simplification, as relevant to the instruction
+ - Maintain the level of detail expected in a PhD-level discourse on the specific topic of the instruction
+ - Incorporate precise citations or references where applicable to support the response
+ - Preserve any conflicting viewpoints or ongoing debates in the field that relate to the instruction
+
+    4. With each iteration, aim to increase information density by 30-40% without sacrificing technical accuracy or critical details that address the instruction.
+
+    5. Ensure the summary includes instruction-specific:
+ - Methodological details (e.g., exact algorithms, parameter settings) that are crucial to addressing the instruction
+ - Precise quantitative results with appropriate units and error bounds that directly relate to the instruction
+ - Detailed descriptions of novel techniques or approaches that are key to addressing the instruction
+ - Critical analysis of strengths and limitations in the research as they pertain to the instruction
+
+ Produce a summary that is significantly more information-dense and technically precise than the previous one, while remaining laser-focused on addressing the given instruction. Use language appropriate for a highly specialized audience in the field."""
+
+ # Use the Anthropic API to generate the summary
+ response = anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}]
+ )
+ return response.content[0].text
+```
+
+3. `summarize_chunk_summaries`: Combines summaries from multiple chunks into a coherent whole.
+
+```python
+@weave.op()
+def summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model="claude-3-opus-20240229"):
+ # Construct a prompt for combining chunk summaries
+ prompt = f"""Given this current summary:
+
+ {current_summary}
+
+ And these chunk summaries:
+
+ {' '.join(chunk_summaries)}
+
+ And this instruction to focus on:
+
+ {instruction}
+
+ Create an extremely dense, final summary that refines the current summary by incorporating key information from the chunk summaries, while specifically addressing the given instruction. Follow these guidelines:
+
+ 1. Integrate the most relevant and important information from the chunk summaries into the current summary.
+ 2. Ensure all key technical content from both the current summary and chunk summaries that relates to the instruction is retained.
+ 3. Aim to reduce overall length by 30-40% while increasing information density.
+ 4. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
+ 5. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
+ 6. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
+ 7. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
+ 8. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints from across all summaries.
+ 9. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.
+
+ The final summary should be a highly concentrated, technical distillation of all provided summaries that specifically addresses the given instruction, suitable for specialists in the field."""
+
+ # Use the Anthropic API to generate the combined summary
+ return anthropic_client.messages.create(
+ model=model,
+ max_tokens=4096,
+ messages=[{"role": "user", "content": prompt}],
+ ).content[0].text
+```
+
+4. `summarize_chunk_iteration`: Manages the process of summarizing all chunks in a single iteration.
+
+```python
+@weave.op()
+def summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model):
+ chunk_summaries = []
+ # Summarize each chunk individually
+ for i, chunk in enumerate(chunks, 1):
+ current_summary = summarize_chunk(chunk, instruction, current_summary, iteration, model)
+ chunk_summaries.append(current_summary)
+ print(f"Iteration {iteration}, Chunk {i}:\n{current_summary}\n")
+ # Combine all chunk summaries into a single summary
+ current_summary = summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model)
+ print(f"Iteration {iteration}, Final Summary:\n{current_summary}\n")
+ return current_summary, chunk_summaries
+```
+
+5. `iterative_chunk_summarization`: Performs multiple iterations of chunk-based summarization.
+
+```python
+@weave.op()
+def iterative_chunk_summarization(chunks, instruction, current_summary, chunk_iterations, model):
+ chunk_iteration_summaries = []
+ chunk_summaries = []
+ # Perform multiple iterations of chunk summarization
+ for iteration in range(1, chunk_iterations + 1):
+ current_summary, iteration_chunk_summaries = summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model)
+ chunk_iteration_summaries.append(current_summary)
+ chunk_summaries.append(iteration_chunk_summaries)
+ return current_summary, chunk_iteration_summaries, chunk_summaries
+```
+
+6. `chain_of_density_summarization`: Orchestrates the entire summarization process, including both chunk-based and density-based summarization steps.
+
+
+
+```python
+@weave.op()
+def chain_of_density_summarization(text, instruction, model="claude-3-5-sonnet-20240620", chunk_size=8192, chunk_iterations=2, density_iterations=2):
+ # Split the text into chunks
+ chunks = chunk_text(text, chunk_size)
+ print(f"Number of chunks: {len(chunks)}")
+ print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
+
+ # Perform chunk-based summarization
+ current_summary, chunk_iteration_summaries, chunk_summaries = iterative_chunk_summarization(chunks, instruction, "", chunk_iterations, model)
+
+ # Perform final density-based summarization
+    current_summary, iteration_summaries = iterative_density_summarization(text, instruction, current_summary, density_iterations, model)
+ final_summary_text = final_summary(instruction, current_summary, model)
+ print(f"Final Summary:\n{final_summary_text}\n")
+
+ # Return all intermediate and final results
+ return {
+ "final_summary": final_summary_text,
+ "accumulated_summary": current_summary,
+ "iteration_summaries": iteration_summaries,
+ "chunk_iteration_summaries": chunk_iteration_summaries,
+ "chunk_summaries": chunk_summaries
+ }
+```
+
+This advanced chunking technique allows for more effective handling of longer documents, potentially improving the quality and comprehensiveness of the final summary.
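+
+A usage sketch of the chunk-aware entry point, assuming `cleaned_text` produced by `replace_images_with_descriptions` and using keyword arguments to make the call order explicit:
+
+```python
+result = chain_of_density_summarization(
+    text=cleaned_text,
+    instruction="Summarize the key methodologies and novel contributions of this research.",
+    chunk_size=8192,
+    chunk_iterations=2,
+    density_iterations=2,
+)
+print(result["final_summary"])
+```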
+
+### Model Evaluation
+
+> Note that the `ArxivChainOfDensityPipeline` class stays identical: only the underlying `chain_of_density_summarization` function is swapped for the chunk-aware version above.
+
+## Advanced Evaluation Metrics
+
+To thoroughly assess the quality and effectiveness of our Chain of Density summarization pipeline, we implement a set of advanced evaluation metrics. These metrics provide a comprehensive analysis of the summarization process, taking into account both the chunk-based approach and the overall summary quality.
+
+### Processing Chunk Summaries
+
+The `process_chunk_summaries` function evaluates the quality of individual chunk summaries:
+
+```python
+def process_chunk_summaries(model_output, instruction, model):
+ scores = {}
+ for i, chunk_list in enumerate(model_output["chunk_summaries"]):
+ chunk_summary_scores = []
+ for j, summary in enumerate(chunk_list):
+ chunk_summary_score = score_summary(summary, f"Chunk Summary {i+1}.{j+1}", instruction, model)
+ chunk_summary_scores.append(chunk_summary_score)
+
+ scores[f"chunk_summaries_analysis_{i+1}"] = {
+ "long_tail_stats": calculate_long_tail_stats(chunk_summary_scores),
+ "iteration_impact": analyze_iteration_impact(chunk_summary_scores),
+ "optimal_improvement_range": find_optimal_improvement_range(chunk_summary_scores),
+ "optimal_score_range": find_optimal_score_range(chunk_summary_scores)
+ }
+ return scores
+```
+
+This function:
+- Scores each chunk summary individually
+- Calculates various statistics for each chunk iteration, including long-tail stats, iteration impact, and optimal improvement ranges
+
+### Processing Chunk Iteration Summaries
+
+The `process_chunk_iteration_summaries` function evaluates the quality of summaries produced after each chunk iteration:
+
+```python
+def process_chunk_iteration_summaries(model_output, instruction, model):
+ chunk_iteration_scores = [
+ score_summary(summary, f"Chunk Iteration Summary {i+1}", instruction, model)
+ for i, summary in enumerate(model_output["chunk_iteration_summaries"])
+ ]
+
+ return {
+ "long_tail_stats": calculate_long_tail_stats(chunk_iteration_scores),
+ "iteration_impact": analyze_iteration_impact(chunk_iteration_scores),
+ "optimal_improvement_range": find_optimal_improvement_range(chunk_iteration_scores),
+ "optimal_score_range": find_optimal_score_range(chunk_iteration_scores)
+ }
+```
+
+This function:
+- Scores each chunk iteration summary
+- Calculates aggregate statistics across all chunk iterations
+
+### Quality Scorer
+
+The `quality_scorer` function serves as the main entry point for our evaluation process:
+
+```python
+@weave.op()
+def quality_scorer(instruction, model_output, model="gpt-4o"):
+ scores = {
+ "chunk_summaries_analysis": {},
+ "chunk_iteration_summaries_analysis": {},
+ "iteration_summaries_analysis": {},
+ "accumulated_summary": {},
+ "final_summary": {}
+ }
+
+ try:
+ chunk_summaries_scores = process_chunk_summaries(model_output, instruction, model)
+ scores.update(chunk_summaries_scores)
+
+ scores["chunk_iteration_summaries_analysis"] = process_chunk_iteration_summaries(model_output, instruction, model)
+ scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)
+ scores["accumulated_summary"] = score_summary(model_output["accumulated_summary"], "Accumulated Summary", instruction, model)
+ scores["final_summary"] = score_summary(model_output["final_summary"], "Final Summary", instruction, model)
+
+ flattened_scores = {}
+ for key, value in scores.items():
+ if isinstance(value, dict):
+ flattened_scores[key] = flatten_dict(value)
+ else:
+ flattened_scores[key] = value
+
+ scores = flatten_dict(flattened_scores)
+
+ except Exception as e:
+ print(f"Error in quality_scorer: {str(e)}")
+ scores["error"] = str(e)
+
+ return scores
+```
+
+This function:
+- Orchestrates the entire evaluation process
+- Processes and scores chunk summaries, chunk iteration summaries, and the final summary
+- Flattens the nested score dictionary for easier analysis
+- Handles any errors that occur during the scoring process
+
+By implementing these advanced evaluation metrics, we can gain deep insights into the performance of our Chain of Density summarization pipeline at various stages of the process. This allows us to identify areas for improvement and optimize our approach for maximum effectiveness.
+
+
+
+## Conclusion
+
+This cookbook has walked through the implementation of an AI-powered summarization bot using the Chain of Density technique. By combining Anthropic's Claude API, the Arxiv API, and Weave for experiment tracking and evaluation, we've built a tool for generating concise, information-dense summaries of scientific papers.
+
+Key takeaways:
+1. The Chain of Density technique allows for iterative refinement of summaries, increasing information density while maintaining relevance to specific instructions.
+2. Our implementation handles both textual content and visual elements (images and vector graphics) from PDF papers, ensuring comprehensive coverage of research content.
+3. The optional advanced chunking technique enables effective processing of longer documents, improving summary quality for extensive research papers.
+4. Robust evaluation metrics provide insights into the summarization process, allowing for continuous improvement and optimization.
+
+Potential applications:
+- Rapid literature review for researchers
+- Automated creation of paper abstracts or extended summaries
+- Assisting in the peer review process by providing concise overviews of submissions
+- Enhancing search and discovery of relevant research papers
+
+By combining state-of-the-art language models with carefully crafted prompts and evaluation techniques, this summarization pipeline demonstrates the potential for AI to significantly accelerate and enhance scientific research processes.
diff --git a/colabs/anthropic/summarization/media/.gitignore b/colabs/anthropic/summarization/media/.gitignore
new file mode 100644
index 00000000..4233b020
--- /dev/null
+++ b/colabs/anthropic/summarization/media/.gitignore
@@ -0,0 +1 @@
+*.screenstudio
diff --git a/colabs/anthropic/summarization/media/arxiv_paper.gif b/colabs/anthropic/summarization/media/arxiv_paper.gif
new file mode 100644
index 00000000..ae746acf
Binary files /dev/null and b/colabs/anthropic/summarization/media/arxiv_paper.gif differ
diff --git a/colabs/anthropic/summarization/media/chain_of_density.gif b/colabs/anthropic/summarization/media/chain_of_density.gif
new file mode 100644
index 00000000..02da71fa
Binary files /dev/null and b/colabs/anthropic/summarization/media/chain_of_density.gif differ
diff --git a/colabs/anthropic/summarization/media/eval_comparison.gif b/colabs/anthropic/summarization/media/eval_comparison.gif
new file mode 100644
index 00000000..df0b7c15
Binary files /dev/null and b/colabs/anthropic/summarization/media/eval_comparison.gif differ
diff --git a/colabs/anthropic/summarization/media/eval_dataset.gif b/colabs/anthropic/summarization/media/eval_dataset.gif
new file mode 100644
index 00000000..c68db6d5
Binary files /dev/null and b/colabs/anthropic/summarization/media/eval_dataset.gif differ
diff --git a/colabs/anthropic/summarization/media/evals_main_screen.gif b/colabs/anthropic/summarization/media/evals_main_screen.gif
new file mode 100644
index 00000000..8739b474
Binary files /dev/null and b/colabs/anthropic/summarization/media/evals_main_screen.gif differ
diff --git a/colabs/anthropic/summarization/media/fetch_arxiv_papers.gif b/colabs/anthropic/summarization/media/fetch_arxiv_papers.gif
new file mode 100644
index 00000000..bf8d44be
Binary files /dev/null and b/colabs/anthropic/summarization/media/fetch_arxiv_papers.gif differ
diff --git a/colabs/anthropic/summarization/media/generate_arxiv_query_args.gif b/colabs/anthropic/summarization/media/generate_arxiv_query_args.gif
new file mode 100644
index 00000000..c51ca5e9
Binary files /dev/null and b/colabs/anthropic/summarization/media/generate_arxiv_query_args.gif differ
diff --git a/colabs/anthropic/summarization/media/model.gif b/colabs/anthropic/summarization/media/model.gif
new file mode 100644
index 00000000..6c19afbe
Binary files /dev/null and b/colabs/anthropic/summarization/media/model.gif differ
diff --git a/colabs/anthropic/summarization/media/model_extract_images.gif b/colabs/anthropic/summarization/media/model_extract_images.gif
new file mode 100644
index 00000000..682d9c34
Binary files /dev/null and b/colabs/anthropic/summarization/media/model_extract_images.gif differ
diff --git a/colabs/anthropic/summarization/media/model_extract_images.mp4 b/colabs/anthropic/summarization/media/model_extract_images.mp4
new file mode 100644
index 00000000..8405ae86
Binary files /dev/null and b/colabs/anthropic/summarization/media/model_extract_images.mp4 differ
diff --git a/colabs/anthropic/summarization/media/model_replace_image_with_descriptions.gif b/colabs/anthropic/summarization/media/model_replace_image_with_descriptions.gif
new file mode 100644
index 00000000..78f355f8
Binary files /dev/null and b/colabs/anthropic/summarization/media/model_replace_image_with_descriptions.gif differ
diff --git a/colabs/anthropic/summarization/requirements.txt b/colabs/anthropic/summarization/requirements.txt
new file mode 100644
index 00000000..444eb2c9
--- /dev/null
+++ b/colabs/anthropic/summarization/requirements.txt
@@ -0,0 +1,12 @@
+arxiv
+anthropic
+python-dotenv
+requests
+tqdm
+PyPDF2
+weave
+filetype
+Pillow
+pdf2image
+openai
+numpy