Spaces:

deepset
/

autoquizzer

Running

App Files Community

anakin87 committed on Jun 3

Commit

083f13e

•

1 Parent(s): 2a15409

update Haystack and simplify

Browse files

Files changed (3) hide show

backend/custom_components.py +2 -85
backend/pipelines.py +3 -2
requirements.txt +1 -2

backend/custom_components.py CHANGED Viewed

@@ -1,93 +1,10 @@
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
-from haystack import Document, component, logging
-from haystack.components.converters.utils import (
-    get_bytestream_from_source,
-    normalize_metadata,
-)
-from haystack.dataclasses import ByteStream
-from trafilatura import extract
 import json
 import json_repair
-logger = logging.getLogger(__name__)
-@component
-class TrafilaturaHTMLConverter:
-    """
-    Converts an HTML file to a Document using Trafilatura.
-    Usage example:
-    ```python
-    converter = TrafilaturaHTMLConverter()
-    results = converter.run(sources=["path/to/sample.html"])
-    documents = results["documents"]
-    print(documents[0].content)
-    # 'This is a text from the HTML file.'
-    ```
-    """
-    @component.output_types(documents=List[Document])
-    def run(
-        self,
-        sources: List[Union[str, Path, ByteStream]],
-        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
-    ):
-        """
-        Converts a list of HTML files to Documents.
-        :param sources:
-            List of HTML file paths or ByteStream objects.
-        :param meta:
-            Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
-        :param extract_kwargs:
-            Additional keyword arguments to pass to the Trafilatura `extract` method.
-            See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.
-        :returns:
-            A dictionary with the following keys:
-            - `documents`: Created Documents
-        """
-        documents = []
-        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
-        for source, metadata in zip(sources, meta_list):
-            try:
-                bytestream = get_bytestream_from_source(source=source)
-            except Exception as e:
-                logger.warning(
-                    "Could not read {source}. Skipping it. Error: {error}",
-                    source=source,
-                    error=e,
-                )
-                continue
-            text = None
-            try:
-                text = extract(bytestream.data.decode("utf-8"))
-            except Exception as conversion_e:
-                logger.warning(
-                    "Failed to extract text from {source}. Error: {error}",
-                    source=source,
-                    error=conversion_e,
-                )
-                continue
-            document = Document(content=text, meta={**bytestream.meta, **metadata})
-            documents.append(document)
-        return {"documents": documents}
 @component
 class QuizParser:
     @component.output_types(quiz=Dict)

+from typing import Dict, List
+from haystack import component
 import json
 import json_repair
 @component
 class QuizParser:
     @component.output_types(quiz=Dict)

backend/pipelines.py CHANGED Viewed

@@ -1,4 +1,5 @@
-from .custom_components import TrafilaturaHTMLConverter, QuizParser
 from haystack.components.fetchers import LinkContentFetcher
 from haystack.components.generators import OpenAIGenerator
 from haystack.components.builders import PromptBuilder
@@ -37,7 +38,7 @@ text:
 quiz_generation_pipeline = Pipeline()
 quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
-quiz_generation_pipeline.add_component("html_converter", TrafilaturaHTMLConverter())
 quiz_generation_pipeline.add_component(
     "prompt_builder", PromptBuilder(template=quiz_generation_template)
 )

+from .custom_components import QuizParser
+from haystack.components.converters import HTMLToDocument
 from haystack.components.fetchers import LinkContentFetcher
 from haystack.components.generators import OpenAIGenerator
 from haystack.components.builders import PromptBuilder
 quiz_generation_pipeline = Pipeline()
 quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
+quiz_generation_pipeline.add_component("html_converter", HTMLToDocument())
 quiz_generation_pipeline.add_component(
     "prompt_builder", PromptBuilder(template=quiz_generation_template)
 )

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
-haystack-ai==2.1.2
-trafilatura
 json-repair
 gradio

+haystack-ai==2.2.0
 json-repair
 gradio