Spaces:
Running
Running
update Haystack and simplify
Browse files
- backend/custom_components.py +2 -85
- backend/pipelines.py +3 -2
- requirements.txt +1 -2
backend/custom_components.py
CHANGED
@@ -1,93 +1,10 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
from typing import Any, Dict, List, Optional, Union
|
3 |
|
4 |
-
from haystack import Document, component, logging
|
5 |
-
from haystack.components.converters.utils import (
|
6 |
-
get_bytestream_from_source,
|
7 |
-
normalize_metadata,
|
8 |
-
)
|
9 |
-
from haystack.dataclasses import ByteStream
|
10 |
-
|
11 |
-
from trafilatura import extract
|
12 |
|
13 |
import json
|
14 |
import json_repair
|
15 |
|
16 |
-
logger = logging.getLogger(__name__)
|
17 |
-
|
18 |
-
|
19 |
-
@component
|
20 |
-
class TrafilaturaHTMLConverter:
|
21 |
-
"""
|
22 |
-
Converts an HTML file to a Document using Trafilatura.
|
23 |
-
|
24 |
-
Usage example:
|
25 |
-
```python
|
26 |
-
converter = TrafilaturaHTMLConverter()
|
27 |
-
results = converter.run(sources=["path/to/sample.html"])
|
28 |
-
documents = results["documents"]
|
29 |
-
print(documents[0].content)
|
30 |
-
# 'This is a text from the HTML file.'
|
31 |
-
```
|
32 |
-
"""
|
33 |
-
|
34 |
-
@component.output_types(documents=List[Document])
|
35 |
-
def run(
|
36 |
-
self,
|
37 |
-
sources: List[Union[str, Path, ByteStream]],
|
38 |
-
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
39 |
-
):
|
40 |
-
"""
|
41 |
-
Converts a list of HTML files to Documents.
|
42 |
-
|
43 |
-
:param sources:
|
44 |
-
List of HTML file paths or ByteStream objects.
|
45 |
-
:param meta:
|
46 |
-
Optional metadata to attach to the Documents.
|
47 |
-
This value can be either a list of dictionaries or a single dictionary.
|
48 |
-
If it's a single dictionary, its content is added to the metadata of all produced Documents.
|
49 |
-
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
|
50 |
-
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
|
51 |
-
:param extract_kwargs:
|
52 |
-
Additional keyword arguments to pass to the Trafilatura `extract` method.
|
53 |
-
See the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for more information.
|
54 |
-
|
55 |
-
:returns:
|
56 |
-
A dictionary with the following keys:
|
57 |
-
- `documents`: Created Documents
|
58 |
-
"""
|
59 |
-
|
60 |
-
documents = []
|
61 |
-
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
|
62 |
-
|
63 |
-
for source, metadata in zip(sources, meta_list):
|
64 |
-
try:
|
65 |
-
bytestream = get_bytestream_from_source(source=source)
|
66 |
-
except Exception as e:
|
67 |
-
logger.warning(
|
68 |
-
"Could not read {source}. Skipping it. Error: {error}",
|
69 |
-
source=source,
|
70 |
-
error=e,
|
71 |
-
)
|
72 |
-
continue
|
73 |
-
|
74 |
-
text = None
|
75 |
-
try:
|
76 |
-
text = extract(bytestream.data.decode("utf-8"))
|
77 |
-
except Exception as conversion_e:
|
78 |
-
logger.warning(
|
79 |
-
"Failed to extract text from {source}. Error: {error}",
|
80 |
-
source=source,
|
81 |
-
error=conversion_e,
|
82 |
-
)
|
83 |
-
continue
|
84 |
-
|
85 |
-
document = Document(content=text, meta={**bytestream.meta, **metadata})
|
86 |
-
documents.append(document)
|
87 |
-
|
88 |
-
return {"documents": documents}
|
89 |
-
|
90 |
-
|
91 |
@component
|
92 |
class QuizParser:
|
93 |
@component.output_types(quiz=Dict)
|
|
|
1 |
+
from typing import Dict, List
|
|
|
2 |
|
3 |
+
from haystack import component
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
import json
|
6 |
import json_repair
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
@component
|
9 |
class QuizParser:
|
10 |
@component.output_types(quiz=Dict)
|
backend/pipelines.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
-
from .custom_components import QuizParser, TrafilaturaHTMLConverter
|
|
|
2 |
from haystack.components.fetchers import LinkContentFetcher
|
3 |
from haystack.components.generators import OpenAIGenerator
|
4 |
from haystack.components.builders import PromptBuilder
|
@@ -37,7 +38,7 @@ text:
|
|
37 |
|
38 |
quiz_generation_pipeline = Pipeline()
|
39 |
quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
|
40 |
-
quiz_generation_pipeline.add_component("html_converter", TrafilaturaHTMLConverter())
|
41 |
quiz_generation_pipeline.add_component(
|
42 |
"prompt_builder", PromptBuilder(template=quiz_generation_template)
|
43 |
)
|
|
|
1 |
+
from .custom_components import QuizParser
|
2 |
+
from haystack.components.converters import HTMLToDocument
|
3 |
from haystack.components.fetchers import LinkContentFetcher
|
4 |
from haystack.components.generators import OpenAIGenerator
|
5 |
from haystack.components.builders import PromptBuilder
|
|
|
38 |
|
39 |
quiz_generation_pipeline = Pipeline()
|
40 |
quiz_generation_pipeline.add_component("link_content_fetcher", LinkContentFetcher())
|
41 |
+
quiz_generation_pipeline.add_component("html_converter", HTMLToDocument())
|
42 |
quiz_generation_pipeline.add_component(
|
43 |
"prompt_builder", PromptBuilder(template=quiz_generation_template)
|
44 |
)
|
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
haystack-ai==2.
|
2 |
-
trafilatura
|
3 |
json-repair
|
4 |
gradio
|
|
|
1 |
+
haystack-ai==2.2.0
|
|
|
2 |
json-repair
|
3 |
gradio
|