| author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-08-16 12:41:45 -0400 |
| --- | --- | --- |
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-08-16 12:41:45 -0400 |
| commit | f2eac77e4073ec06ab0cccca9aa8f98916f62d5b (patch) | |
| tree | ccf11ae5a7d5afe6df1fa307e7fbc30f696de082 | |
| parent | 6f9b8f9b393d411a17f7954b6cc36618efe698e2 (diff) | |
Webscraping works with Screenshot API
| -rw-r--r-- | package-lock.json | 66 |
| -rw-r--r-- | package.json | 1 |
| -rw-r--r-- | src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts | 13 |
| -rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 70 |
4 files changed, 116 insertions, 34 deletions
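
In short, the `/scrapeWebsite` route no longer extracts text with cheerio: it asks Scrapfly to render the page in a headless browser, captures a full-page screenshot, downloads it, and returns it to the client as base64, which `WebsiteInfoScraperTool` then wraps in an image chunk. The following is a minimal sketch of the server-side flow in isolation, distilled from the handler in the diff below; the helper name is illustrative, and the Express plumbing, file writes, and error handling are omitted.

```typescript
import axios from 'axios';
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

// Same client construction and env var as in AssistantManager.ts below.
const scrapflyClient = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });

// Illustrative helper: render `url` in a headless browser, capture a full-page
// screenshot, and return it as a base64-encoded JPEG string.
async function screenshotUrlAsBase64(url: string): Promise<string | undefined> {
    const result = await scrapflyClient.scrape(
        new ScrapeConfig({
            url,
            render_js: true, // headless browser rendering is required for screenshots
            screenshots: { everything: 'fullpage' }, // 'fullpage' captures the whole page
        })
    );

    // Scrapfly exposes each named screenshot ('everything' above) behind a URL that
    // must be fetched with the API key; the body comes back as binary image data.
    for (const [, screenshot] of Object.entries(result.result.screenshots)) {
        const response = await axios.get(screenshot.url, {
            params: { key: process.env._CLIENT_SCRAPFLY_API_KEY! },
            responseType: 'arraybuffer',
        });
        return Buffer.from(response.data).toString('base64');
    }
    return undefined; // no screenshot was captured
}
```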
diff --git a/package-lock.json b/package-lock.json
index c73470775..601addf4e 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -216,6 +216,7 @@
         "rimraf": "^6.0.0",
         "sass": "^1.69.5",
         "sass-loader": "^14.2.0",
+        "scrapfly-sdk": "^0.6.4",
         "serializr": "^3.0.2",
         "shelljs": "^0.8.5",
         "socket.io": "^4.7.2",
@@ -3309,6 +3310,42 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "node_modules/@deno/shim-deno": {
+      "version": "0.18.2",
+      "resolved": "https://registry.npmjs.org/@deno/shim-deno/-/shim-deno-0.18.2.tgz",
+      "integrity": "sha512-oQ0CVmOio63wlhwQF75zA4ioolPvOwAoK0yuzcS5bDC1JUvH3y1GS8xPh8EOpcoDQRU4FTG8OQfxhpR+c6DrzA==",
+      "dependencies": {
+        "@deno/shim-deno-test": "^0.5.0",
+        "which": "^4.0.0"
+      }
+    },
+    "node_modules/@deno/shim-deno-test": {
+      "version": "0.5.0",
+      "resolved": "https://registry.npmjs.org/@deno/shim-deno-test/-/shim-deno-test-0.5.0.tgz",
+      "integrity": "sha512-4nMhecpGlPi0cSzT67L+Tm+GOJqvuk8gqHBziqcUQOarnuIax1z96/gJHCSIz2Z0zhxE6Rzwb3IZXPtFh51j+w=="
+    },
+    "node_modules/@deno/shim-deno/node_modules/isexe": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/isexe/-/isexe-3.1.1.tgz",
+      "integrity": "sha512-LpB/54B+/2J5hqQ7imZHfdU31OlgQqx7ZicVlkm9kzg9/w8GKLEcFfJl/t7DCEDueOyBAD6zCCwTO6Fzs0NoEQ==",
+      "engines": {
+        "node": ">=16"
+      }
+    },
+    "node_modules/@deno/shim-deno/node_modules/which": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/which/-/which-4.0.0.tgz",
+      "integrity": "sha512-GlaYyEb07DPxYCKhKzplCWBJtvxZcZMrL+4UkrTSJHHPyZU4mYYTv3qaOe77H7EODLSSopAUFAc6W8U4yqvscg==",
+      "dependencies": {
+        "isexe": "^3.1.1"
+      },
+      "bin": {
+        "node-which": "bin/which.js"
+      },
+      "engines": {
+        "node": "^16.13.0 || >=18.0.0"
+      }
+    },
     "node_modules/@discoveryjs/json-ext": {
       "version": "0.5.7",
       "resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz",
@@ -39681,6 +39718,35 @@
       "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
       "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
     },
+    "node_modules/scrapfly-sdk": {
+      "version": "0.6.4",
+      "resolved": "https://registry.npmjs.org/scrapfly-sdk/-/scrapfly-sdk-0.6.4.tgz",
+      "integrity": "sha512-hP7WK+jhcANKa2+fEpvLKee5wNRgQlw1xCPfKo/x8CTVccAOKaUYm2P6OqLHg5mINIBHDSHhOXjBRpuKo/Cd/w==",
+      "dependencies": {
+        "@deno/shim-deno": "~0.18.0",
+        "cheerio": "1.0.0-rc.12"
+      }
+    },
+    "node_modules/scrapfly-sdk/node_modules/cheerio": {
+      "version": "1.0.0-rc.12",
+      "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
+      "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
+      "dependencies": {
+        "cheerio-select": "^2.1.0",
+        "dom-serializer": "^2.0.0",
+        "domhandler": "^5.0.3",
+        "domutils": "^3.0.1",
+        "htmlparser2": "^8.0.1",
+        "parse5": "^7.0.0",
+        "parse5-htmlparser2-tree-adapter": "^7.0.0"
+      },
+      "engines": {
+        "node": ">= 6"
+      },
+      "funding": {
+        "url": "https://github.com/cheeriojs/cheerio?sponsor=1"
+      }
+    },
     "node_modules/scss-loader": {
       "version": "0.0.1",
       "resolved": "https://registry.npmjs.org/scss-loader/-/scss-loader-0.0.1.tgz",
diff --git a/package.json b/package.json
index e285205dd..544917e78 100644
--- a/package.json
+++ b/package.json
@@ -301,6 +301,7 @@
     "rimraf": "^6.0.0",
     "sass": "^1.69.5",
     "sass-loader": "^14.2.0",
+    "scrapfly-sdk": "^0.6.4",
     "serializr": "^3.0.2",
     "shelljs": "^0.8.5",
     "socket.io": "^4.7.2",
diff --git a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
index 59fd47b7a..c59afefbd 100644
--- a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
@@ -24,10 +24,19 @@ export class WebsiteInfoScraperTool extends BaseTool<{ url: string }> {
 
     async execute(args: { url: string }): Promise<any> {
         try {
-            const { html } = await Networking.PostToServer('/scrapeWebsite', { url: args.url });
+            const { website_image_base64 } = await Networking.PostToServer('/scrapeWebsite', { url: args.url });
             const id = uuidv4();
             this._addLinkedUrlDoc(args.url, id);
-            return [{ type: 'text', text: `<chunk chunk_id=${id} chunk_type=text> ${html} </chunk>` }];
+            return [
+                { type: 'text', text: `<chunk chunk_id=${id} chunk_type=url> ` },
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: `data:image/jpeg;base64,${website_image_base64}`,
+                    },
+                },
+                { type: 'text', text: `</chunk>\n` },
+            ];
         } catch (error) {
             return [{ type: 'text', text: 'An error occurred while scraping the website.' }];
         }
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 6460edb9a..c034960c9 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -13,6 +13,7 @@ import { UnstructuredClient } from 'unstructured-client';
 import { PartitionResponse } from 'unstructured-client/sdk/models/operations';
 import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared';
 import * as cheerio from 'cheerio';
+import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
 
 export enum Directory {
     parsed_files = 'parsed_files',
@@ -51,6 +52,7 @@
                 apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!,
             },
         });
+        const scrapflyClient = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });
 
         register({
             method: Method.POST,
@@ -136,38 +138,41 @@
             secureHandler: async ({ req, res }) => {
                 const { url } = req.body;
                 try {
-                    const html = await fetchWithRetry(url);
-                    const $ = cheerio.load(html);
-
-                    // Remove script, style tags, and other non-content elements
-                    $('script, style, noscript, iframe, svg, img, video, audio').remove();
-
-                    // Function to extract text from an element and its children
-                    function extractText(element: any): string {
-                        let text = '';
-                        element.contents().each((_: any, el: any) => {
-                            if (el.type === 'text') {
-                                text += $(el).text().trim() + ' ';
-                            } else if (el.type === 'tag' && !['script', 'style'].includes(el.name)) {
-                                text += extractText($(el)) + ' ';
-                            }
+                    const result = await scrapflyClient.scrape(
+                        new ScrapeConfig({
+                            url: url,
+                            // enable headless browsers for screenshots
+                            render_js: true,
+                            // optional: you can wait for page to load before capturing
+                            screenshots: {
+                                // name: what-to-capture
+                                // fullpage - will capture everything
+                                // css selector (e.g. #reviews) - will capture just that element
+                                everything: 'fullpage',
+                            },
+                        })
+                    );
+                    console.log(result.result.screenshots);
+
+                    for (let [name, screenshot] of Object.entries(result.result.screenshots)) {
+                        let response = await axios.get(screenshot.url, {
+                            // note: don't forget to add your API key parameter:
+                            params: { key: process.env._CLIENT_SCRAPFLY_API_KEY!, options: 'print_media_format' },
+                            // this indicates that response is binary data:
+                            responseType: 'arraybuffer',
                         });
-                        return text.trim();
-                    }
-
-                    // Extract all visible text from the body
-                    const bodyText = extractText($('body'));
-
-                    // Split the text into lines and remove empty lines
-                    const lines = bodyText
-                        .split('\n')
-                        .map(line => line.trim())
-                        .filter(line => line.length > 0);
-
-                    // Join the lines back together
-                    const extractedContent = lines.join('\n');
+                        // write to screenshot data to a file in current directory:
+                        fs.writeFileSync(`example-screenshot-${name}.${screenshot.extension}`, response.data);
+                        const base64String = response.data.toString('base64');
+                        const directory = path.join(publicDirectory, '/files/scrape_images/');
+                        if (!fs.existsSync(directory)) {
+                            fs.mkdirSync(directory);
+                        }
+                        const filePath = path.join(directory, 'example-screenshot-' + name + '.' + screenshot.extension);
+                        await fs.promises.writeFile(filePath, response.data);
 
-                    res.send({ content: extractedContent });
+                        res.send({ website_image_base64: base64String });
+                    }
                 } catch (error: any) {
                     console.error('Error scraping website:', error);
                     res.status(500).send({ error: 'Failed to scrape website', details: error.message });
@@ -282,9 +287,10 @@
                     } catch (error) {
                         console.error(`Error reading image file for chunk ${chunk.id}:`, error);
                     }
+                    content.push({ type: 'text', text: `\n</chunk>\n` });
+                } else {
+                    content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
                 }
-
-                content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
             }
 
             content.push({ type: 'text', text: '</chunks>' });
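
On the client side, `WebsiteInfoScraperTool` now consumes the new `website_image_base64` response key and emits the screenshot as an image content part instead of scraped text. Because the handler above sends its response inside the screenshot loop, each request effectively yields the single full-page capture configured under `everything`. A hedged sketch of calling the endpoint directly follows; the import path is illustrative, while `Networking.PostToServer`, the route, and the response key follow the diff above.

```typescript
// Illustrative import path; Networking is the same helper WebsiteInfoScraperTool uses.
import { Networking } from '../../Network';

// Fetch a full-page screenshot of `url` via the new /scrapeWebsite route and
// return it as a data URI suitable for an image_url content part.
async function websiteScreenshotDataUri(url: string): Promise<string> {
    const { website_image_base64 } = await Networking.PostToServer('/scrapeWebsite', { url });
    return `data:image/jpeg;base64,${website_image_base64}`;
}
```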