fix(wikipedia-streaming): convert to streaming-compatible responses

Eric Z 2024-12-29 05:31:36 -06:00
parent 5a6e476fd9
commit 542d143a88
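
pipe() previously built one markdown string and returned it, so a client that requested a streamed response ("stream": true in the request body) could not receive chunks. The per-query lookup is now factored into a stream_retrieve() generator: pipe() yields from it directly when streaming, and drains it into a context buffer otherwise. A minimal sketch of that split follows; the names fetch_parts and run are hypothetical stand-ins for stream_retrieve and pipe, not part of this diff:

    from typing import Generator, Iterator, Union

    def fetch_parts(query: str) -> Generator[str, None, None]:
        # stand-in for stream_retrieve(): emits markdown chunks for one query
        yield f"## {query}\n"
        yield "summary text...\n"

    def run(user_message: str, body: dict) -> Union[str, Generator, Iterator]:
        streaming = body.get("stream", False)
        if streaming:
            # hand chunks to the caller as they are produced
            return fetch_parts(user_message)
        # otherwise drain the generator into one response string
        return "".join(fetch_parts(user_message))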


@@ -79,7 +79,10 @@ class Pipeline:
         messages: List[dict],
         body: dict
     ) -> Union[str, Generator, Iterator]:
-        # This is where you can add your custom pipelines like RAG.
+        """
+        Main pipeline function. Performs wikipedia article lookup by query
+        and returns the summary of the first article.
+        """
         logger.debug(f"pipe:{self.name}")
 
         # Check if title generation is requested
@@ -101,34 +104,64 @@ class Pipeline:
         # 'user': {'name': 'User', 'id': '235a828f-84a3-44a0-b7af-721ee8be6571',
         #          'email': 'admin@localhost', 'role': 'admin'}}
 
-        re_query = re.compile(r"[^0-9A-Z]", re.IGNORECASE)
-        re_rough_word = re.compile(r"[\w]+", re.IGNORECASE)
-        topics = []
         dt_start = datetime.now()
+        multi_part = False
+        streaming = body.get("stream", False)
+        logger.warning(f"Stream: {streaming}")
+        context = ""
 
         # examples from https://pypi.org/project/wikipedia/
         # new addition - ability to include multiple topics with a semicolon
         for query in user_message.split(';'):
             self.rate_check(dt_start)
             query = query.strip()
+            if multi_part:
+                if streaming:
+                    yield "---\n"
+                else:
+                    context += "---\n"
+            if streaming:
+                yield from self.stream_retrieve(query, dt_start)
+            else:
+                for chunk in self.stream_retrieve(query, dt_start):
+                    context += chunk
+            multi_part = True
+
+        if not streaming:
+            return context if context else "No information found"
+
+    def stream_retrieve(
+        self, query: str, dt_start: datetime,
+    ) -> Generator:
+        """
+        Retrieve the wikipedia page for the query and yield the summary.
+        Returns a generator for streaming responses, but it can also be
+        iterated to build a single response.
+        """
+        re_query = re.compile(r"[^0-9A-Z]", re.IGNORECASE)
+        re_rough_word = re.compile(r"[\w]+", re.IGNORECASE)
+        titles_found = None
         try:
             titles_found = wikipedia.search(query)
             # r = requests.get(
             #     f"https://en.wikipedia.org/w/api.php?action=opensearch&search={query}&limit=1&namespace=0&format=json"
             # )
             logger.info(f"Query: {query}, Found: {titles_found}")
-            topics.append((query, titles_found))
         except Exception as e:
             logger.error(f"Search Error: {query} -> {e}")
-            return f"Page Search Error: {query}"
+            yield f"Page Search Error: {query}"
+        if titles_found is None or not titles_found:  # no results
+            yield f"No information found for '{query}'"
+            return
 
-        context = ""
-        for query, titles_found in topics:
-            self.rate_check(dt_start)
-            if context:  # add separator if multiple topics
-                context += "---\n"
+        self.rate_check(dt_start)
+        # if context:  # add separator if multiple topics
+        #     context += "---\n"
         try:
             title_check = titles_found[0]
             wiki_page = wikipedia.page(title_check, auto_suggest=False)  # trick! don't auto-suggest
@@ -136,14 +169,14 @@ class Pipeline:
             str_error = str(e).replace("\n", ", ")
             str_error = f"## Disambiguation Error ({query})\n* Status: {str_error}"
             logger.error(str_error)
-            context += str_error + "\n"
-            continue
+            yield str_error + "\n"
+            return
         except wikipedia.exceptions.RedirectError as e:
             str_error = str(e).replace("\n", ", ")
             str_error = f"## Redirect Error ({query})\n* Status: {str_error}"
             logger.error(str_error)
-            context += str_error + "\n"
-            continue
+            yield str_error + "\n"
+            return
         except Exception as e:
             if titles_found:
                 str_error = f"## Page Retrieve Error ({query})\n* Found Topics (matched '{title_check}') {titles_found}"
@@ -151,36 +184,35 @@ class Pipeline:
             else:
                 str_error = f"## Page Not Found ({query})\n* Unknown error"
             logger.error(f"{str_error} -> {e}")
-            context += str_error + "\n"
-            continue
+            yield str_error + "\n"
+            return
 
         # found a page / section
         logger.info(f"Page Sections[{query}]: {wiki_page.sections}")
-        context += f"## {title_check}\n"
+        yield f"## {title_check}\n"
 
         # flatten internal links
         # link_md = [f"[{x}]({self.valves.WIKIPEDIA_ROOT}/{re_query.sub('_', x)})" for x in wiki_page.links[:10]]
-        # context += "* Links (first 30): " + ",".join(link_md) + "\n"
+        # yield "* Links (first 30): " + ",".join(link_md) + "\n"
 
         # add the textual summary
         summary_full = wiki_page.summary
         word_positions = [x.start() for x in re_rough_word.finditer(summary_full)]
         if len(word_positions) > self.valves.WORD_LIMIT:
-            context += summary_full[:word_positions[self.valves.WORD_LIMIT]] + "...\n"
+            yield summary_full[:word_positions[self.valves.WORD_LIMIT]] + "...\n"
         else:
-            context += summary_full + "\n"
+            yield summary_full + "\n"
 
         # the more you know! link to further reading
-        context += "### Learn More" + "\n"
-        context += f"* [Read more on Wikipedia...]({wiki_page.url})\n"
+        yield "### Learn More" + "\n"
+        yield f"* [Read more on Wikipedia...]({wiki_page.url})\n"
 
         # also spit out the related topics from search
         link_md = [f"[{x}]({self.valves.WIKIPEDIA_ROOT}/{re_query.sub('_', x)})" for x in titles_found]
-        context += f"* Related topics: {', '.join(link_md)}\n"
+        yield f"* Related topics: {', '.join(link_md)}\n"
 
         # throw in the first image for good measure
         if wiki_page.images:
-            context += f"\n![Image: {title_check}]({wiki_page.images[0]})\n"
+            yield f"\n![Image: {title_check}]({wiki_page.images[0]})\n"
 
-        # done with querying for different pages
-        return context if context else "No information found"
+        return
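
Because pipe() now contains yield expressions, calling it always produces a generator, even when body["stream"] is false; in that case the final `return context if context else ...` surfaces as StopIteration.value rather than as an ordinary return value. A minimal consumption sketch, assuming the usual Open WebUI pipe arguments (user_message, model_id, messages, body; only the last two are visible in this hunk):

    # hypothetical driver; the pipe() signature beyond messages/body is assumed
    pipeline = Pipeline()

    # streaming: print chunks as they arrive
    for chunk in pipeline.pipe(
        user_message="Python (programming language); Monty Python",
        model_id="wikipedia",
        messages=[],
        body={"stream": True},
    ):
        print(chunk, end="")

    # non-streaming: pipe() is still a generator, so the accumulated
    # context comes back via StopIteration.value
    gen = pipeline.pipe(
        user_message="Python (programming language)",
        model_id="wikipedia",
        messages=[],
        body={"stream": False},
    )
    try:
        while True:
            next(gen)  # this mode yields nothing before returning
    except StopIteration as stop:
        print(stop.value)  # the accumulated context string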