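"""Gradio app: extract article text from user-supplied URLs with newspaper3k
and query it through an Azure OpenAI chat deployment."""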
import gradio as gr
import os
import openai
from newspaper import Article
import json
import re
from transformers import GPT2Tokenizer
import requests


# define the text summarizer function
def text_prompt(request, system_role, page_urls_str, api_key, api_base, deployment_id, temp):
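    """Fetch each URL, extract its article text with newspaper3k, trim it to a
    token budget, and query an Azure OpenAI chat deployment with the user's
    prompt against that text.

    Returns a tuple of strings: (extracted page texts, model responses,
    total token count with an estimated cost).
    """
    # NOTE: the GPT-2 tokenizer only approximates the chat model's tokenizer;
    # it is used here for rough token budgeting, not exact counts.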
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    page_urls = [page_url_str.strip() for page_url_str in page_urls_str.split("\n") if page_url_str.strip()]
    if len(page_urls) == 0:
        return "", "--- No URLs found ---", ""

    page_texts = []
    response_texts = []
    total_tokens = 0
    for page_url in page_urls:
        try:
            # Spoof a browser User-Agent so sites are less likely to block the request
            headers = {'User-Agent': 'Chrome/83.0.4103.106'}
            response = requests.get(page_url, headers=headers, timeout=30)
            html = response.text

            # Parse the already-downloaded HTML with newspaper3k (no second fetch)
            page = Article('')
            page.set_html(html)
            page.parse()

        except Exception as e:
            return "", f"--- An error occurred while processing the URL: {e} ---", ""

        # Naive sentence split on periods; good enough for token budgeting
        sentences = page.text.split('.')

        tokens = []
        page_text = ""

        for sentence in sentences:
            tokens.extend(tokenizer.tokenize(sentence))

            # Stop once the running count exceeds 3100 tokens, leaving headroom
            # for the prompt, system role, and completion
            if len(tokens) > 3100:
                break
            page_text += sentence + ". "
        # The prompt and system role also count against the token budget
        tokens.extend(tokenizer.tokenize(request))
        tokens.extend(tokenizer.tokenize(system_role))
        # Strip the trailing space left by the sentence loop
        page_text = page_text.strip()

        num_tokens = len(tokens)

        tokens_condition = num_tokens > 10
        api_key_condition = len(api_key) > 6
        deployment_id_condition = len(deployment_id) > 6
        if tokens_condition and api_key_condition and deployment_id_condition:
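            # NOTE: this targets the pre-1.0 openai Python SDK, whose Azure
            # interface is openai.api_type/api_base/api_key plus
            # ChatCompletion.create(deployment_id=...)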
            openai.api_type = "azure"
            openai.api_version = "2023-05-15"
            openai.api_base = api_base
            openai.api_key = api_key
            max_tokens = 4000 - num_tokens  # TODO: replace the hard-coded 4000 with a per-deployment_id max-token lookup
            # get the response from openai API
            try:
                response = openai.ChatCompletion.create(
                    deployment_id=deployment_id,
                    messages=[
                        {"role": "system", "content": system_role},
                        {"role": "user", "content": request + "\n\n" + 'Text:\n\n""""' + page_text + '\n""""'}
                    ],
                    max_tokens=max_tokens,
                    temperature=temp,
                    top_p=1.0,
                )
                # get the response text
                response_text = response['choices'][0]['message']['content']
                total_tokens += response["usage"]["total_tokens"]

                # clean the response text
                response_text = re.sub(r'\s+', ' ', response_text)
                response_text = f"#### [{page.title}]({page_url})\n\n{response_text.strip()}\n"

            except Exception as e:
                response_text = f"#### [{page.title}]({page_url})\n\n"
                response_text += f"--- An error occurred while processing the request: {e} ---\n"
            page_texts.append(page.text)
            response_texts.append(response_text)

        else:
            page_text_temp = "ERROR:\n\n"
            if page.text:
                page_text_temp += page.text

            response_text_temp = "#### "
            if page.title:
                response_text_temp += f"[{page.title}]({page_url})"
            if not tokens_condition:
                response_text_temp += "\n\nERROR: Token problems! The page text may be empty or the URL could not be read. "
            if not api_key_condition:
                response_text_temp += "\n\nERROR: API key problems! Check the pasted API key for leading or trailing spaces. "
            if not deployment_id_condition:
                response_text_temp += "\n\nERROR: deployment_id problems! Check the pasted deployment_id for leading or trailing spaces. "

            page_texts.append(page_text_temp)
            response_texts.append(response_text_temp)

    page_texts_str = "".join([f"====== NEW URL: {URL} ======\n{page_text}\n\n" for page_text, URL in zip(page_texts, page_urls)])
    response_texts_str = "\n\n".join([response_text for response_text in response_texts])
    # NOTE: the cost estimate assumes a flat $0.03 per 1K tokens; adjust to your deployment's pricing
    total_tokens_str = str(total_tokens) + " (${:.2f} USD)".format(total_tokens / 1000 * 0.03)
    return page_texts_str, response_texts_str, total_tokens_str
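

# A minimal sketch of calling text_prompt() directly, outside the Gradio UI.
# Every value below is a placeholder (hypothetical endpoint, key, and deployment):
#
# inputs, summaries, tokens = text_prompt(
#     request="Summarize the following text in two sentences.",
#     system_role="You are a concise technical summarizer.",
#     page_urls_str="https://example.com/article",
#     api_key="YOUR-AZURE-OPENAI-KEY",
#     api_base="https://YOUR-RESOURCE.openai.azure.com/",
#     deployment_id="gpt-35-turbo",
#     temp=0.0,
# )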


# define the gradio interface
iface = gr.Interface(
    fn=text_prompt,
    inputs=[gr.Textbox(lines=1, placeholder="Enter your prompt here...", label="Prompt:", type="text"),
            gr.Textbox(lines=1, placeholder="Enter your system-role description here...", label="System Role:", type="text"),
            gr.Textbox(lines=10, placeholder="Enter the Articles' URLs here...", label="Articles' URLs to parse (one per line up to 10):", type="text"),
            gr.Textbox(lines=1, placeholder="Enter your API-key here...", label="API-Key:", type="password"),
            gr.Textbox(lines=1, placeholder="Enter your Azure OpenAI API base here...", label="Enter Azure API base (Endpoint):", type="text"),
            gr.Textbox(lines=1, placeholder="Enter your model name here...", label="Deployment ID:", type="text"),
            gr.Slider(0.0, 1.0, value=0.0, label="Temperature:")
            ],
    outputs=[gr.Textbox(label="Input:"), gr.Markdown(label="Output:"), gr.Markdown(label="Total Tokens:")],
    title="ChatGPT info extraction from URL",
    description="This tool allows querying the text retrieved from the URL with newspaper3k lib and using MSFT Azure OpenAI's [gpt-3.5-turbo] engine.\nThe URL text can be referenced in the prompt as \"following text\".\nA GPT2 tokenizer is included to ensure that the 1.800 token limit for OpenAI queries is not exceeded. Provide a prompt with your request, the description for the system role, the url for text retrieval, your api-key and temperature to process the text."
)

# Capture launch errors and surface them in the output component
# (note: if launch() itself fails, the updated value is never rendered)

error_message = ""

try:
    # queue(concurrency_count=...) is the pre-4.x Gradio queuing API
    iface.queue(concurrency_count=20)
    iface.launch(debug=True)
except Exception as e:
    error_message = "An error occurred: " + str(e)
    iface.outputs[1].value = error_message
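
# To run locally (assuming this file is saved as app.py; the pre-1.0 openai
# SDK is required for the ChatCompletion interface used above):
#   pip install "gradio<4" "openai<1" newspaper3k transformers requests
#   python app.py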