Diffstat (limited to 'src')
-rw-r--r--   src/client/views/nodes/ChatBox/MessageComponent.tsx            |  3
-rw-r--r--   src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts | 17
-rw-r--r--   src/server/ApiManagers/AssistantManager.ts                     | 47
3 files changed, 28 insertions, 39 deletions
diff --git a/src/client/views/nodes/ChatBox/MessageComponent.tsx b/src/client/views/nodes/ChatBox/MessageComponent.tsx
index 00e9795e3..3edfb272c 100644
--- a/src/client/views/nodes/ChatBox/MessageComponent.tsx
+++ b/src/client/views/nodes/ChatBox/MessageComponent.tsx
@@ -16,6 +16,7 @@ const MessageComponentBox: React.FC<MessageComponentProps> = function ({ message
     const renderContent = (item: MessageContent) => {
         const i = item.index;
+        console.log('item', item, 'index', i);
         if (item.type === TEXT_TYPE.GROUNDED) {
             const citation_ids = item.citation_ids || [];
             return (
@@ -26,7 +27,7 @@ const MessageComponentBox: React.FC<MessageComponentProps> = function ({ message
                         if (!citation) return null;
                         return (
                             <button key={i + idx} className="citation-button" onClick={() => onCitationClick(citation)}>
-                                {idx + 1}
+                                {i + 1}
                             </button>
                         );
                     })}
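The change above swaps the number shown inside each citation button from idx + 1 (the citation's position within the grounded item) to i + 1 (the content item's own index), so every citation button rendered for one grounded item now displays the same number. A minimal TypeScript sketch of that before/after behavior, using a hypothetical stand-in for the ChatBox's real MessageContent type:

// Hypothetical stand-in type; the real ChatBox MessageContent differs.
interface MessageContent { index: number; citation_ids?: string[] }

// Labels shown on the citation buttons for a given content item.
function citationLabels(item: MessageContent): number[] {
    const i = item.index;
    const citation_ids = item.citation_ids || [];
    // Before this commit: labels were idx + 1, i.e. 1, 2, 3 within the item.
    // After: every button in the item shows i + 1, the item's own index.
    return citation_ids.map(() => i + 1);
}

// A third content item with two citations now renders "3" on both buttons:
console.log(citationLabels({ index: 2, citation_ids: ['a', 'b'] })); // [3, 3]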
diff --git a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
index 739329bea..4588b5aec 100644
--- a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
@@ -17,8 +17,8 @@ export class WebsiteInfoScraperTool extends BaseTool<{ url: string | string[] }>
                 max_inputs: '3',
             },
         },
-            'Provide up to 3 URLs of websites that you have identified as the most relevant from the previous search. This tool will provide screenshots of those specific websites. It will also create a document from the scraped content for future reference. When providing a response to the user, ideally reference as many of the websites as possible in order to provide a well grounded result.',
-            'Returns the full content of the webpages as images for analysis.'
+            'Provide up to 3 URLs of websites that you have identified as the most relevant from the previous search. This tool will provide the text content of those specific websites. When providing a final response to the user based on information from these chunks, ideally cite as many of the url chunks as possible (ground your information from multiple sources, if possible) in order to provide a well grounded result.',
+            'Returns the text content of the webpages for analysis.'
         );
         this._addLinkedUrlDoc = addLinkedUrlDoc;
     }
@@ -29,19 +29,10 @@ export class WebsiteInfoScraperTool extends BaseTool<{ url: string | string[] }>
         for (const url of urls) {
             try {
-                const { website_image_base64 } = await Networking.PostToServer('/scrapeWebsite', { url });
+                const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
                 const id = uuidv4();
                 this._addLinkedUrlDoc(url, id);
-                results.push(
-                    { type: 'text', text: `<chunk chunk_id=${id} chunk_type=url> ` },
-                    {
-                        type: 'image_url',
-                        image_url: {
-                            url: `data:image/jpeg;base64,${website_image_base64}`,
-                        },
-                    },
-                    { type: 'text', text: `</chunk>\n` }
-                );
+                results.push({ type: 'text', text: `<chunk chunk_id=${id} chunk_type=url>\n${website_plain_text}\n</chunk>\n` });
             } catch (error) {
                 results.push({ type: 'text', text: `An error occurred while scraping the website: ${url}` });
             }
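On the tool side, each scraped page now comes back as a single text part that wraps the page text in chunk tags keyed by a fresh UUID, replacing the old text/image_url/text triplet. A minimal sketch of that chunk construction, assuming only the { website_plain_text } payload shape from the server route below; buildUrlChunk is an illustrative name, not a function in the repo:

import { v4 as uuidv4 } from 'uuid';

// Illustrative helper mirroring the tool's new results.push(...) call.
function buildUrlChunk(website_plain_text: string): { type: 'text'; text: string } {
    const id = uuidv4();
    // A single text part wraps the scraped text in <chunk> tags so the model
    // can cite it by chunk_id, instead of receiving a base64 screenshot.
    return { type: 'text', text: `<chunk chunk_id=${id} chunk_type=url>\n${website_plain_text}\n</chunk>\n` };
}

console.log(buildUrlChunk('Example article text').text);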
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index afaeaf961..a59a2d22d 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -15,7 +15,9 @@ import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/share
 import * as cheerio from 'cheerio';
 import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
 import { google } from 'googleapis';
-import puppeteer from 'puppeteer';
+import * as puppeteer from 'puppeteer';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
 
 export enum Directory {
     parsed_files = 'parsed_files',
@@ -142,39 +144,34 @@ export default class AssistantManager extends ApiManager {
             secureHandler: async ({ req, res }) => {
                 const { url } = req.body;
                 try {
-                    const url_filename = url.replace(/\./g, '-').replace(/\//g, '_') + '.jpg';
-                    const scrapedImagesDirectory = pathToDirectory(Directory.scrape_images);
-                    const filePath = serverPathToFile(Directory.scrape_images, url_filename);
-
-                    // Check if the image already exists
-                    if (fs.existsSync(filePath)) {
-                        const imageBuffer = await readFileAsync(filePath);
-                        const base64Image = imageBuffer.toString('base64');
-                        console.log('Image already exists');
-                        return res.send({ website_image_base64: base64Image });
-                    }
-
-                    // Create the directory if it doesn't exist
-                    if (!fs.existsSync(scrapedImagesDirectory)) {
-                        fs.mkdirSync(scrapedImagesDirectory);
-                    }
-
-                    // Launch Puppeteer to take a screenshot of the webpage
+                    // Launch Puppeteer to navigate to the webpage
                     const browser = await puppeteer.launch({
                         args: ['--no-sandbox', '--disable-setuid-sandbox'],
                     });
                     const page = await browser.newPage();
                     await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
                     await page.goto(url, { waitUntil: 'networkidle2' });
-                    const screenshotBuffer = await page.screenshot({ fullPage: true });
+
+                    // Get the HTML content of the page
+                    const htmlContent = await page.content();
                     await browser.close();
 
-                    // Save the screenshot to the file system
-                    await writeFileAsync(filePath, screenshotBuffer);
+                    // Use JSDOM to parse the HTML content
+                    const dom = new JSDOM(htmlContent, { url });
 
-                    // Return the base64-encoded image
-                    const base64Image = Buffer.from(screenshotBuffer).toString('base64');
-                    res.send({ website_image_base64: base64Image });
+                    // Use Readability to extract the readable content
+                    const reader = new Readability(dom.window.document);
+                    const article = reader.parse();
+
+                    if (article) {
+                        // Extract the plain text from the article content
+                        const plainText = article.textContent;
+
+                        // Return the plain text content
+                        res.send({ website_plain_text: plainText });
+                    } else {
+                        res.status(500).send({ error: 'Failed to extract readable content' });
+                    }
                 } catch (error: any) {
                     console.error('Error scraping website:', error);
                     res.status(500).send({ error: 'Failed to scrape website', details: error.message });
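The server route now renders the page with Puppeteer, parses the resulting HTML with JSDOM, and runs Mozilla's Readability over it to pull out readable text, rather than screenshotting and caching images. A standalone sketch of that pipeline under the same assumptions as the diff; the scrapePlainText name and return shape are illustrative, not from the repo:

import * as puppeteer from 'puppeteer';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

async function scrapePlainText(url: string): Promise<string | null> {
    const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    try {
        const page = await browser.newPage();
        // networkidle2 waits until the page has (nearly) stopped loading,
        // so client-rendered content is present in page.content().
        await page.goto(url, { waitUntil: 'networkidle2' });
        const html = await page.content();
        // Passing { url } lets Readability resolve relative links in the page.
        const dom = new JSDOM(html, { url });
        // parse() returns null when no readable article can be extracted.
        const article = new Readability(dom.window.document).parse();
        return article ? article.textContent : null;
    } finally {
        await browser.close();
    }
}

scrapePlainText('https://example.com').then(text => console.log(text?.slice(0, 200)));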