Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  47
1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index afaeaf961..a59a2d22d 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -15,7 +15,9 @@ import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/share
import * as cheerio from 'cheerio';
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
import { google } from 'googleapis';
-import puppeteer from 'puppeteer';
+import * as puppeteer from 'puppeteer';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';

export enum Directory {
parsed_files = 'parsed_files',
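
This hunk switches to a namespace import for puppeteer and brings in jsdom and Readability, which together replace the screenshot pipeline in the second hunk: Readability needs a live DOM Document, and jsdom can build one from an HTML string. A minimal sketch of that pairing on its own, where the sample HTML and URL are placeholders rather than values from this code:

import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

// Placeholder HTML; in the handler below it comes from page.content().
const html = '<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>';

// Passing the page URL lets jsdom resolve relative links inside the document.
const dom = new JSDOM(html, { url: 'https://example.com/post' });

// Readability mutates the document it is given, so clone it first if the DOM is reused.
const article = new Readability(dom.window.document).parse();
if (article) {
    console.log(article.title);       // extracted headline
    console.log(article.textContent); // readable plain text
}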
@@ -142,39 +144,34 @@ export default class AssistantManager extends ApiManager {
secureHandler: async ({ req, res }) => {
const { url } = req.body;

                try {
- const url_filename = url.replace(/\./g, '-').replace(/\//g, '_') + '.jpg';
- const scrapedImagesDirectory = pathToDirectory(Directory.scrape_images);
- const filePath = serverPathToFile(Directory.scrape_images, url_filename);
-
- // Check if the image already exists
- if (fs.existsSync(filePath)) {
- const imageBuffer = await readFileAsync(filePath);
- const base64Image = imageBuffer.toString('base64');
- console.log('Image already exists');
- return res.send({ website_image_base64: base64Image });
- }
-
- // Create the directory if it doesn't exist
- if (!fs.existsSync(scrapedImagesDirectory)) {
- fs.mkdirSync(scrapedImagesDirectory);
- }
-
- // Launch Puppeteer to take a screenshot of the webpage
+ // Launch Puppeteer to navigate to the webpage
const browser = await puppeteer.launch({
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
await page.goto(url, { waitUntil: 'networkidle2' });
- const screenshotBuffer = await page.screenshot({ fullPage: true });
+
+ // Get the HTML content of the page
+ const htmlContent = await page.content();
await browser.close();

-                // Save the screenshot to the file system
-                await writeFileAsync(filePath, screenshotBuffer);
-                // Return the base64-encoded image
-                const base64Image = Buffer.from(screenshotBuffer).toString('base64');
-                res.send({ website_image_base64: base64Image });
+                // Use JSDOM to parse the HTML content
+                const dom = new JSDOM(htmlContent, { url });
+ // Use Readability to extract the readable content
+ const reader = new Readability(dom.window.document);
+ const article = reader.parse();
+
+ if (article) {
+ // Extract the plain text from the article content
+ const plainText = article.textContent;
+
+ // Return the plain text content
+ res.send({ website_plain_text: plainText });
+ } else {
+ res.status(500).send({ error: 'Failed to extract readable content' });
+ }
} catch (error: any) {
console.error('Error scraping website:', error);
res.status(500).send({ error: 'Failed to scrape website', details: error.message });
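
With this change the endpoint's response shape moves from { website_image_base64 } to { website_plain_text }, so any caller that expected the screenshot field needs updating. A hypothetical client call, where the '/scrapeWebsite' path is an assumption since the route registration sits outside the lines shown in this diff:

// Hypothetical usage; '/scrapeWebsite' stands in for the real route path.
const response = await fetch('/scrapeWebsite', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url: 'https://example.com/article' }),
});
const { website_plain_text } = await response.json();
console.log(website_plain_text);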