File size: 5,647 Bytes
cbb225c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40a0d94
 
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
2016f17
cbb225c
40a0d94
 
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
 
 
 
 
 
 
40a0d94
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
 
 
 
 
 
40a0d94
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
 
 
40a0d94
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40a0d94
 
 
 
 
 
 
 
 
cbb225c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Util that calls Wikipedia. references: https://github.com/hwchase17/langchain/blob/9b615022e2b6a3591347ad77a3e21aad6cf24c49/docs/extras/modules/agents/tools/integrations/wikipedia.ipynb#L36"""
import logging
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, root_validator

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maximum number of query characters forwarded to the Wikipedia search call;
# longer queries are truncated to this length before searching.
WIKIPEDIA_MAX_QUERY_LENGTH = 300


class WikipediaAPIWrapper(BaseModel):
    """Wrapper around the ``wikipedia`` python package.

    To use, you should have the ``wikipedia`` python package installed.
    This wrapper uses the Wikipedia API to conduct searches and fetch page
    summaries. By default, :meth:`run` returns the page summaries of the
    top-k search results, truncated to ``doc_content_chars_max`` characters.

    :param wiki_client: The Wikipedia API client. It is always set to the
        imported ``wikipedia`` module by :meth:`validate_environment`, so a
        value passed by the caller is overwritten.
    :type wiki_client: Any
    :param top_k_results: The number of search results to use.
    :type top_k_results: int
    :param lang: The language to use for the Wikipedia API.
    :type lang: str
    :param doc_content_chars_max: The maximum number of characters kept from
        a page's content and from the joined summaries.
    :type doc_content_chars_max: int
    """

    wiki_client: Any
    top_k_results: int = 5
    lang: str = "en"
    doc_content_chars_max: int = 4000

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in environment.

        Imports ``wikipedia``, configures its language and stores the module
        itself as ``wiki_client``.

        :param values: The raw values passed to the model constructor.
        :type values: Dict
        :return: The values with ``wiki_client`` filled in.
        :rtype: Dict
        :raises ImportError: If the package is not installed.
        """
        try:
            import wikipedia
        except ImportError as err:
            raise ImportError(
                "Could not import wikipedia python package. "
                "Please install it with `pip install wikipedia`."
            ) from err

        # This validator runs with ``pre=True``, i.e. before field defaults
        # are applied, so "lang" is only present when the caller passed it.
        # Fall back to the same default as the ``lang`` field instead of
        # raising ``KeyError``.
        # NOTE(review): ``set_lang`` is called on the module, so the language
        # is presumably shared by every wrapper instance - confirm.
        wikipedia.set_lang(values.get("lang", "en"))
        values["wiki_client"] = wikipedia
        return values

    def run(self, query: str) -> str:
        """Run Wikipedia search and get page summaries.

        :param query: The query to search for.
        :type query: str
        :return: The formatted page summaries separated by blank lines and
            truncated to ``doc_content_chars_max`` characters, or a fixed
            "not found" message when no page could be summarised.
        :rtype: str
        """
        summaries = []
        for page_title in self.search_page_titles(query):
            # The formatter needs the page object (it reads ``.summary``),
            # not the truncated content string returned by ``_fetch_page``.
            wiki_page = self._fetch_wiki_page(page_title)
            if wiki_page is None:
                continue
            if summary := self._formatted_page_summary(page_title, wiki_page):
                summaries.append(summary)
        if not summaries:
            return "No good Wikipedia Search Result was found"
        return "\n\n".join(summaries)[: self.doc_content_chars_max]

    def _fetch_wiki_page(self, page: str) -> Optional[Any]:
        """Fetch the page object for a title from Wikipedia.

        :param page: The title of the page to fetch.
        :type page: str
        :return: The page object returned by the client, or ``None`` when the
            client raises ``PageError`` or ``DisambiguationError``.
        :rtype: Optional[Any]
        """
        try:
            return self.wiki_client.page(title=page, auto_suggest=False)
        except (
            self.wiki_client.exceptions.PageError,
            self.wiki_client.exceptions.DisambiguationError,
        ):
            return None

    def _fetch_page(self, page: str) -> Optional[str]:
        """Fetch page content from Wikipedia.

        :param page: The title of the page to fetch.
        :type page: str
        :return: The page content truncated to ``doc_content_chars_max``
            characters, or ``None`` when the client raises ``PageError`` or
            ``DisambiguationError``.
        :rtype: Optional[str]
        """
        try:
            wiki_page = self.wiki_client.page(title=page, auto_suggest=False)
            return wiki_page.content[: self.doc_content_chars_max]
        except (
            self.wiki_client.exceptions.PageError,
            self.wiki_client.exceptions.DisambiguationError,
        ):
            return None

    def search_page_titles(self, query: str) -> List[str]:
        """Run Wikipedia search and get the matching page titles.

        The query is truncated to ``WIKIPEDIA_MAX_QUERY_LENGTH`` characters
        before searching, and at most ``top_k_results`` titles are returned.

        :param query: The query to search for.
        :type query: str
        :return: The page titles.
        :rtype: List[str]
        """
        truncated_query = query[:WIKIPEDIA_MAX_QUERY_LENGTH]
        return self.wiki_client.search(truncated_query)[: self.top_k_results]

    # def _page_to_document(self, page_title: str, wiki_page: Any) -> Document:
    #     main_meta = {
    #         "title": page_title,
    #         "summary": wiki_page.summary,
    #         "source": wiki_page.url,
    #     }
    #     add_meta = (
    #         {
    #             "categories": wiki_page.categories,
    #             "page_url": wiki_page.url,
    #             "image_urls": wiki_page.images,
    #             "related_titles": wiki_page.links,
    #             "parent_id": wiki_page.parent_id,
    #             "references": wiki_page.references,
    #             "revision_id": wiki_page.revision_id,
    #             "sections": wiki_page.sections,
    #         }
    #         if self.load_all_available_meta
    #         else {}
    #     )
    #     doc = Document(
    #         page_content=wiki_page.content[: self.doc_content_chars_max],
    #         metadata={
    #             **main_meta,
    #             **add_meta,
    #         },
    #     )
    #     return doc

    @staticmethod
    def _formatted_page_summary(page_title: str, wiki_page: Any) -> Optional[str]:
        """Format the page title and summary in a single string.

        :param page_title: The page title.
        :type page_title: str
        :param wiki_page: The Wikipedia page object; its ``summary``
            attribute is read.
        :type wiki_page: Any
        :return: The formatted page summary.
        :rtype: Optional[str]
        """
        return f"Page: {page_title}\nSummary: {wiki_page.summary}"

    # def load(self, query: str) -> List[Document]:
    #     """
    #     Run Wikipedia search and get the article text plus the meta information.
    #     See
    #
    #     Returns: a list of documents.
    #
    #     """
    #     page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH])
    #     docs = []
    #     for page_title in page_titles[: self.top_k_results]:
    #         if wiki_page := self._fetch_page(page_title):
    #             if doc := self._page_to_document(page_title, wiki_page):
    #                 docs.append(doc)
    #     return docs