author     A.J. Shulman <Shulman.aj@gmail.com>    2024-08-21 14:13:56 -0400
committer  A.J. Shulman <Shulman.aj@gmail.com>    2024-08-21 14:13:56 -0400
commit     e5464e4c04ef6f8a2bbf868b43bbcdba54239406 (patch)
tree       3e9f93eb0fd6fc3448984f8f383386e3779ea296 /src/server/ApiManagers/AssistantManager.ts
parent     79e4c4a3fba42b90ffa656db3ca435505f978afe (diff)
fixed citations so they show different numbers
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  47
1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index afaeaf961..a59a2d22d 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -15,7 +15,9 @@ import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/share
 import * as cheerio from 'cheerio';
 import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
 import { google } from 'googleapis';
-import puppeteer from 'puppeteer';
+import * as puppeteer from 'puppeteer';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
 
 export enum Directory {
     parsed_files = 'parsed_files',
@@ -142,39 +144,34 @@ export default class AssistantManager extends ApiManager {
             secureHandler: async ({ req, res }) => {
                 const { url } = req.body;
                 try {
-                    const url_filename = url.replace(/\./g, '-').replace(/\//g, '_') + '.jpg';
-                    const scrapedImagesDirectory = pathToDirectory(Directory.scrape_images);
-                    const filePath = serverPathToFile(Directory.scrape_images, url_filename);
-
-                    // Check if the image already exists
-                    if (fs.existsSync(filePath)) {
-                        const imageBuffer = await readFileAsync(filePath);
-                        const base64Image = imageBuffer.toString('base64');
-                        console.log('Image already exists');
-                        return res.send({ website_image_base64: base64Image });
-                    }
-
-                    // Create the directory if it doesn't exist
-                    if (!fs.existsSync(scrapedImagesDirectory)) {
-                        fs.mkdirSync(scrapedImagesDirectory);
-                    }
-
-                    // Launch Puppeteer to take a screenshot of the webpage
+                    // Launch Puppeteer to navigate to the webpage
                     const browser = await puppeteer.launch({
                         args: ['--no-sandbox', '--disable-setuid-sandbox'],
                     });
                     const page = await browser.newPage();
                     await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
                     await page.goto(url, { waitUntil: 'networkidle2' });
-                    const screenshotBuffer = await page.screenshot({ fullPage: true });
+
+                    // Get the HTML content of the page
+                    const htmlContent = await page.content();
                     await browser.close();
 
-                    // Save the screenshot to the file system
-                    await writeFileAsync(filePath, screenshotBuffer);
+                    // Use JSDOM to parse the HTML content
+                    const dom = new JSDOM(htmlContent, { url });
 
-                    // Return the base64-encoded image
-                    const base64Image = Buffer.from(screenshotBuffer).toString('base64');
-                    res.send({ website_image_base64: base64Image });
+                    // Use Readability to extract the readable content
+                    const reader = new Readability(dom.window.document);
+                    const article = reader.parse();
+
+                    if (article) {
+                        // Extract the plain text from the article content
+                        const plainText = article.textContent;
+
+                        // Return the plain text content
+                        res.send({ website_plain_text: plainText });
+                    } else {
+                        res.status(500).send({ error: 'Failed to extract readable content' });
+                    }
                 } catch (error: any) {
                     console.error('Error scraping website:', error);
                     res.status(500).send({ error: 'Failed to scrape website', details: error.message });
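For reference, below is a minimal standalone sketch of the flow this patch introduces in the website-scraping handler: Puppeteer renders the page, JSDOM builds a DOM from the rendered HTML, and Readability extracts the main article text. The `extractReadableText` helper name is hypothetical and not part of the repository; the Express request/response wiring and user-agent handling from `AssistantManager` are omitted.

```typescript
// Hypothetical sketch of the Puppeteer + JSDOM + Readability flow; names are
// illustrative and not taken from the repository.
import * as puppeteer from 'puppeteer';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

async function extractReadableText(url: string): Promise<string | null> {
    // Render the page in headless Chromium so client-side content is included.
    const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    try {
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: 'networkidle2' });
        const html = await page.content();

        // Parse the rendered HTML; passing `url` lets JSDOM resolve relative links.
        const dom = new JSDOM(html, { url });

        // Readability strips navigation and boilerplate and returns the main article,
        // or null if it cannot find readable content.
        const article = new Readability(dom.window.document).parse();
        return article ? article.textContent : null;
    } finally {
        await browser.close();
    }
}

// Example usage:
// extractReadableText('https://example.com/post').then(text => console.log(text?.slice(0, 200)));
```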