Diffstat (limited to 'src')
-rw-r--r--  src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts | 13
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts                     | 70
2 files changed, 49 insertions(+), 34 deletions(-)
diff --git a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
index 59fd47b7a..c59afefbd 100644
--- a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts
@@ -24,10 +24,19 @@ export class WebsiteInfoScraperTool extends BaseTool<{ url: string }> {
async execute(args: { url: string }): Promise<any> {
try {
- const { html } = await Networking.PostToServer('/scrapeWebsite', { url: args.url });
+ const { website_image_base64 } = await Networking.PostToServer('/scrapeWebsite', { url: args.url });
const id = uuidv4();
this._addLinkedUrlDoc(args.url, id);
- return [{ type: 'text', text: `<chunk chunk_id=${id} chunk_type=text> ${html} </chunk>` }];
+ return [
+ { type: 'text', text: `<chunk chunk_id=${id} chunk_type=url> ` },
+ {
+ type: 'image_url',
+ image_url: {
+ url: `data:image/jpeg;base64,${website_image_base64}`,
+ },
+ },
+ { type: 'text', text: `</chunk>\n` },
+ ];
} catch (error) {
return [{ type: 'text', text: 'An error occurred while scraping the website.' }];
}
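The tool now returns an OpenAI-style multimodal content array (text parts wrapping an image_url part) instead of raw HTML text. A minimal sketch of how such an array could be fed to a vision-capable chat model, assuming the OpenAI Node SDK; the client setup, model name, and prompt below are illustrative and not part of this change:

    // Hypothetical consumer of the array returned by WebsiteInfoScraperTool.execute().
    // Assumes the OpenAI Node SDK; the model name and prompt are placeholders.
    import OpenAI from 'openai';

    const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

    async function describeScrapedPage(toolResult: any[]): Promise<string | null> {
        const completion = await openai.chat.completions.create({
            model: 'gpt-4o',
            messages: [
                {
                    role: 'user',
                    // the text chunks and the data-URL image part are passed through as-is
                    content: [{ type: 'text', text: 'Describe the scraped page:' }, ...toolResult],
                },
            ],
        });
        return completion.choices[0].message.content;
    }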
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 6460edb9a..c034960c9 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -13,6 +13,7 @@ import { UnstructuredClient } from 'unstructured-client';
import { PartitionResponse } from 'unstructured-client/sdk/models/operations';
import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared';
import * as cheerio from 'cheerio';
+import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
export enum Directory {
parsed_files = 'parsed_files',
@@ -51,6 +52,7 @@ export default class AssistantManager extends ApiManager {
apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!,
},
});
+ const scrapflyClient = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });
register({
method: Method.POST,
@@ -136,38 +138,41 @@ export default class AssistantManager extends ApiManager {
secureHandler: async ({ req, res }) => {
const { url } = req.body;
try {
- const html = await fetchWithRetry(url);
- const $ = cheerio.load(html);
-
- // Remove script, style tags, and other non-content elements
- $('script, style, noscript, iframe, svg, img, video, audio').remove();
-
- // Function to extract text from an element and its children
- function extractText(element: any): string {
- let text = '';
- element.contents().each((_: any, el: any) => {
- if (el.type === 'text') {
- text += $(el).text().trim() + ' ';
- } else if (el.type === 'tag' && !['script', 'style'].includes(el.name)) {
- text += extractText($(el)) + ' ';
- }
+ const result = await scrapflyClient.scrape(
+ new ScrapeConfig({
+ url: url,
+                        // render the page in a headless browser so a screenshot can be captured
+                        render_js: true,
+                        // optionally, a rendering wait can be configured so the page finishes loading before capture
+                        screenshots: {
+                            // name -> what to capture:
+                            // 'fullpage' captures the entire page;
+                            // a CSS selector (e.g. '#reviews') captures just that element
+                            everything: 'fullpage',
+                        },
+ })
+ );
+ console.log(result.result.screenshots);
+
+                    for (const [name, screenshot] of Object.entries(result.result.screenshots)) {
+                        const response = await axios.get(screenshot.url, {
+                            // the screenshot URL requires the Scrapfly API key as a query parameter
+                            params: { key: process.env._CLIENT_SCRAPFLY_API_KEY!, options: 'print_media_format' },
+                            // fetch the screenshot as binary data so it can be written to disk and base64-encoded
+                            responseType: 'arraybuffer',
});
- return text.trim();
- }
-
- // Extract all visible text from the body
- const bodyText = extractText($('body'));
-
- // Split the text into lines and remove empty lines
- const lines = bodyText
- .split('\n')
- .map(line => line.trim())
- .filter(line => line.length > 0);
-
- // Join the lines back together
- const extractedContent = lines.join('\n');
+                        // encode the screenshot so it can be returned to the client as base64
+                        const base64String = response.data.toString('base64');
+                        // also persist a copy under the public files directory
+                        const directory = path.join(publicDirectory, '/files/scrape_images/');
+                        if (!fs.existsSync(directory)) {
+                            fs.mkdirSync(directory, { recursive: true });
+                        }
+                        const filePath = path.join(directory, 'example-screenshot-' + name + '.' + screenshot.extension);
+                        await fs.promises.writeFile(filePath, response.data);
- res.send({ content: extractedContent });
+ res.send({ website_image_base64: base64String });
+ }
} catch (error: any) {
console.error('Error scraping website:', error);
res.status(500).send({ error: 'Failed to scrape website', details: error.message });
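Condensed, the new flow in this handler is: render the page with Scrapfly, download the capture using the API key, and base64-encode it. A standalone sketch that reuses only the calls appearing above; the wrapper function and its return shape are illustrative:

    // Sketch of the screenshot-to-base64 flow used by /scrapeWebsite above; only the
    // wrapper function is invented, the Scrapfly and axios calls mirror the handler.
    import axios from 'axios';
    import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

    const scrapfly = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });

    async function captureFullPageAsBase64(url: string): Promise<string> {
        const result = await scrapfly.scrape(
            new ScrapeConfig({ url, render_js: true, screenshots: { everything: 'fullpage' } })
        );
        // 'everything' is the only screenshot configured, so take the first entry
        const [, screenshot] = Object.entries(result.result.screenshots)[0] as [string, any];
        const response = await axios.get(screenshot.url, {
            params: { key: process.env._CLIENT_SCRAPFLY_API_KEY!, options: 'print_media_format' },
            responseType: 'arraybuffer',
        });
        return Buffer.from(response.data).toString('base64');
    }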
@@ -282,9 +287,10 @@ export default class AssistantManager extends ApiManager {
} catch (error) {
console.error(`Error reading image file for chunk ${chunk.id}:`, error);
}
+ content.push({ type: 'text', text: `\n</chunk>\n` });
+ } else {
+ content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
}
-
- content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
}
content.push({ type: 'text', text: '</chunks>' });
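After this change an image chunk contributes an image_url part followed by its own closing tag, while a text chunk contributes a single text part carrying chunk.metadata.text. Roughly, for one image chunk and one text chunk, the assembled content array takes the shape below; the ids, opening tags, and sample text are placeholders, since the opening pushes sit outside this hunk:

    // Illustrative shape of the assembled `content` array; ids, tags, and text are placeholders.
    const content = [
        { type: 'text', text: '<chunks>' },
        { type: 'text', text: '<chunk chunk_id=img-1 chunk_type=image>' },
        { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' } },
        { type: 'text', text: '\n</chunk>\n' },
        { type: 'text', text: '<chunk chunk_id=txt-2 chunk_type=text>' },
        { type: 'text', text: 'Some extracted text\n</chunk>\n' },
        { type: 'text', text: '</chunks>' },
    ];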