author     A.J. Shulman <Shulman.aj@gmail.com>    2024-08-21 14:13:56 -0400
committer  A.J. Shulman <Shulman.aj@gmail.com>    2024-08-21 14:13:56 -0400
commit     e5464e4c04ef6f8a2bbf868b43bbcdba54239406 (patch)
tree       3e9f93eb0fd6fc3448984f8f383386e3779ea296 /src/server/ApiManagers/AssistantManager.ts
parent     79e4c4a3fba42b90ffa656db3ca435505f978afe (diff)
fixed citations so they show different numbers
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  47
1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index afaeaf961..a59a2d22d 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -15,7 +15,9 @@ import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/share
 import * as cheerio from 'cheerio';
 import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
 import { google } from 'googleapis';
-import puppeteer from 'puppeteer';
+import * as puppeteer from 'puppeteer';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
 
 export enum Directory {
     parsed_files = 'parsed_files',
@@ -142,39 +144,34 @@ export default class AssistantManager extends ApiManager {
             secureHandler: async ({ req, res }) => {
                 const { url } = req.body;
                 try {
-                    const url_filename = url.replace(/\./g, '-').replace(/\//g, '_') + '.jpg';
-                    const scrapedImagesDirectory = pathToDirectory(Directory.scrape_images);
-                    const filePath = serverPathToFile(Directory.scrape_images, url_filename);
-
-                    // Check if the image already exists
-                    if (fs.existsSync(filePath)) {
-                        const imageBuffer = await readFileAsync(filePath);
-                        const base64Image = imageBuffer.toString('base64');
-                        console.log('Image already exists');
-                        return res.send({ website_image_base64: base64Image });
-                    }
-
-                    // Create the directory if it doesn't exist
-                    if (!fs.existsSync(scrapedImagesDirectory)) {
-                        fs.mkdirSync(scrapedImagesDirectory);
-                    }
-
-                    // Launch Puppeteer to take a screenshot of the webpage
+                    // Launch Puppeteer to navigate to the webpage
                     const browser = await puppeteer.launch({
                         args: ['--no-sandbox', '--disable-setuid-sandbox'],
                     });
                     const page = await browser.newPage();
                     await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
                     await page.goto(url, { waitUntil: 'networkidle2' });
-                    const screenshotBuffer = await page.screenshot({ fullPage: true });
+
+                    // Get the HTML content of the page
+                    const htmlContent = await page.content();
                     await browser.close();
 
-                    // Save the screenshot to the file system
-                    await writeFileAsync(filePath, screenshotBuffer);
+                    // Use JSDOM to parse the HTML content
+                    const dom = new JSDOM(htmlContent, { url });
 
-                    // Return the base64-encoded image
-                    const base64Image = Buffer.from(screenshotBuffer).toString('base64');
-                    res.send({ website_image_base64: base64Image });
+                    // Use Readability to extract the readable content
+                    const reader = new Readability(dom.window.document);
+                    const article = reader.parse();
+
+                    if (article) {
+                        // Extract the plain text from the article content
+                        const plainText = article.textContent;
+
+                        // Return the plain text content
+                        res.send({ website_plain_text: plainText });
+                    } else {
+                        res.status(500).send({ error: 'Failed to extract readable content' });
+                    }
                 } catch (error: any) {
                     console.error('Error scraping website:', error);
                     res.status(500).send({ error: 'Failed to scrape website', details: error.message });
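For reference, below is a minimal standalone sketch of the flow this patch introduces in the website-scraping handler: Puppeteer renders the page, JSDOM builds a DOM from the rendered HTML, and Readability extracts the main article text. The `extractReadableText` helper name is hypothetical and not part of the repository; the Express request/response wiring and user-agent handling from `AssistantManager` are omitted.

```typescript
// Hypothetical sketch of the Puppeteer + JSDOM + Readability flow; names are
// illustrative and not taken from the repository.
import * as puppeteer from 'puppeteer';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

async function extractReadableText(url: string): Promise<string | null> {
    // Render the page in headless Chromium so client-side content is included.
    const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    try {
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: 'networkidle2' });
        const html = await page.content();

        // Parse the rendered HTML; passing `url` lets JSDOM resolve relative links.
        const dom = new JSDOM(html, { url });

        // Readability strips navigation and boilerplate and returns the main article,
        // or null if it cannot find readable content.
        const article = new Readability(dom.window.document).parse();
        return article ? article.textContent : null;
    } finally {
        await browser.close();
    }
}

// Example usage:
// extractReadableText('https://example.com/post').then(text => console.log(text?.slice(0, 200)));
```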