*TaskWeaver DuckDuckGo Browserless*

internet_search.py

import requests
from duckduckgo_search import DDGS
from taskweaver.plugin import Plugin, register_plugin
import os, json
from unstructured.partition.html import partition_html

@register_plugin
class InternetSearchPlugin(Plugin):
    """Search DuckDuckGo and scrape the text of every result page.

    Pages are rendered through the hosted Browserless ``/content`` endpoint,
    so the ``BROWSERLESS_API_KEY`` environment variable must be set.
    """

    # Maximum number of characters in each returned content chunk.
    CHUNK_SIZE = 8000
    # Seconds to wait on the Browserless service before giving up.
    REQUEST_TIMEOUT = 60

    def __call__(self, query: str, max_results=2):
        """Run a DuckDuckGo text search and scrape each hit.

        Args:
            query: The search query.
            max_results: Maximum number of search hits to scrape.

        Returns:
            A list of dicts, one per hit that had a URL. Each dict has
            ``'url'`` and ``'content'`` (a list of text chunks). When a page
            could not be fetched, ``'content'`` is empty and an ``'error'``
            key names the failure.
        """
        # Materialize inside the ``with`` so the search session is still open
        # if ``ddgs.text`` yields lazily.
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))

        scraped_results = []
        for result in results:
            url = self.sanitize_url(result.get('href'))
            if not url:
                # A hit without a link cannot be scraped.
                continue
            try:
                content = self.scrape_website(url)
            except requests.RequestException as exc:
                # Report only the exception type: the message of a requests
                # error embeds the request URL, which carries the API token.
                scraped_results.append(
                    {'url': url, 'content': [], 'error': type(exc).__name__}
                )
                continue
            scraped_results.append({'url': url, 'content': content})
        return scraped_results

    # Alias kept so existing code that calls ``plugin.call(...)`` still works.
    call = __call__

    @staticmethod
    def sanitize_url(url):
        """Return ``url`` without trailing slashes; falsy values pass through."""
        return url.rstrip('/') if url else url

    def scrape_website(self, website: str):
        """Fetch a page through Browserless and return its text in chunks.

        Args:
            website: The full URL only; a final slash is not needed, e.g.
                ``https://google.com`` or ``https://clearbit.com/about-us``.

        Returns:
            The extracted page text split into strings of at most
            ``CHUNK_SIZE`` characters; an empty list if the page had no text.

        Raises:
            KeyError: If ``BROWSERLESS_API_KEY`` is not set.
            requests.RequestException: On timeout, connection failure, or a
                non-2xx response from Browserless.
        """
        sanitized_url = self.sanitize_url(website)
        response = requests.post(
            "https://chrome.browserless.io/content",
            # Let requests build and escape the query string.
            params={'token': os.environ['BROWSERLESS_API_KEY']},
            headers={'cache-control': 'no-cache', 'content-type': 'application/json'},
            data=json.dumps({"url": sanitized_url}),
            timeout=self.REQUEST_TIMEOUT,
        )
        # Do not parse an error page as if it were the requested content.
        response.raise_for_status()

        if not response.text.strip():
            # Nothing to parse.
            return []

        elements = partition_html(text=response.text)
        content = "\n\n".join(str(el) for el in elements)
        size = self.CHUNK_SIZE
        return [content[i:i + size] for i in range(0, len(content), size)]

internet_search.yaml

name: internet_search
enabled: true
required: false
description: >-
  The InternetSearchPlugin performs internet searches using DuckDuckGo and scrapes the content of the search results.

parameters:
  - name: query
    type: str
    required: true
    description: The search query for retrieving and scraping internet search results.

returns:
  - name: search_results
    type: list
    description: >-
      A list of dictionaries containing the URL and scraped content of each search result.