aboutsummaryrefslogtreecommitdiff
path: root/src/server
diff options
context:
space:
mode:
Diffstat (limited to 'src/server')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts5
-rw-r--r--src/server/ApiManagers/DownloadManager.ts262
-rw-r--r--src/server/ApiManagers/UploadManager.ts21
-rw-r--r--src/server/chunker/requirements.txt37
-rw-r--r--src/server/index.ts2
5 files changed, 43 insertions, 284 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 20ec0dfe6..b917f555c 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -532,7 +532,7 @@ export default class AssistantManager extends ApiManager {
try {
// Read the file data and encode it as base64
- const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' });
+ const file_data = fs.readFileSync(public_path, { encoding: 'base64' });
// Generate a unique job ID for tracking
const jobId = uuid.v4();
@@ -781,7 +781,8 @@ function spawnPythonProcess(jobId: string, file_path: string) {
console.log('Virtual environment not found. Creating and setting up...');
// Create venv
- const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+ // const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+ const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
createVenvProcess.on('close', code => {
if (code !== 0) {
diff --git a/src/server/ApiManagers/DownloadManager.ts b/src/server/ApiManagers/DownloadManager.ts
deleted file mode 100644
index 5ee21fb44..000000000
--- a/src/server/ApiManagers/DownloadManager.ts
+++ /dev/null
@@ -1,262 +0,0 @@
-import * as Archiver from 'archiver';
-import * as express from 'express';
-import * as path from 'path';
-import { URL } from 'url';
-import { DashUploadUtils, SizeSuffix } from '../DashUploadUtils';
-import { Method } from '../RouteManager';
-import RouteSubscriber from '../RouteSubscriber';
-import { Directory, publicDirectory, serverPathToFile } from '../SocketData';
-import { Database } from '../database';
-import ApiManager, { Registration } from './ApiManager';
-
-export type Hierarchy = { [id: string]: string | Hierarchy };
-export type ZipMutator = (file: Archiver.Archiver) => void | Promise<void>;
-export interface DocumentElements {
- data: string | any[];
- title: string;
-}
-
-/**
- * This is a very specific utility method to help traverse the database
- * to parse data and titles out of images and collections alone.
- *
- * We don't know if the document id given to us corresponds to a view document or a data
- * document. If it's a data document, the response from the database will have
- * a data field. If not, call recursively on the proto, and resolve with *its* data
- *
- * @param targetId the id of the Dash document whose data is being requested
- * @returns the data of the document, as well as its title
- */
-async function getData(targetId: string): Promise<DocumentElements> {
- return new Promise<DocumentElements>((resolve, reject) => {
- Database.Instance.getDocument(targetId, async (result: any) => {
- const { data, proto, title } = result.fields;
- if (data) {
- if (data.url) {
- resolve({ data: data.url, title });
- } else if (data.fields) {
- resolve({ data: data.fields, title });
- } else {
- reject();
- }
- } else if (proto) {
- getData(proto.fieldId).then(resolve, reject);
- } else {
- reject();
- }
- });
- });
-}
-
-/**
- * This function starts with a single document id as a seed,
- * typically that of a collection, and then descends the entire tree
- * of image or collection documents that are reachable from that seed.
- * @param seedId the id of the root of the subtree we're trying to capture, interesting only if it's a collection
- * @param hierarchy the data structure we're going to use to record the nesting of the collections and images as we descend
-
-Below is an example of the JSON hierarchy built from two images contained inside a collection titled 'a nested collection',
-following the general recursive structure shown immediately below
-{
- "parent folder name":{
- "first child's fild name":"first child's url"
- ...
- "nth child's fild name":"nth child's url"
- }
-}
-{
- "a nested collection (865c4734-c036-4d67-a588-c71bb43d1440)":{
- "an image of a cat (ace99ffd-8ed8-4026-a5d5-a353fff57bdd).jpg":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
- "1*SGJw31T5Q9Zfsk24l2yirg.gif (9321cc9b-9b3e-4cb6-b99c-b7e667340f05).gif":"https://cdn-media-1.freecodecamp.org/images/1*SGJw31T5Q9Zfsk24l2yirg.gif"
- }
-}
-*/
-async function buildHierarchyRecursive(seedId: string, hierarchy: Hierarchy): Promise<void> {
- const { title, data } = await getData(seedId);
- const label = `${title} (${seedId})`;
- // is the document a collection?
- if (Array.isArray(data)) {
- // recurse over all documents in the collection.
- const local: Hierarchy = {}; // create a child hierarchy for this level, which will get passed in as the parent of the recursive call
- hierarchy[label] = local; // store it at the index in the parent, so we'll end up with a map of maps of maps
- await Promise.all(data.map(proxy => buildHierarchyRecursive(proxy.fieldId, local)));
- } else {
- // now, data can only be a string, namely the url of the image
- const filename = label + path.extname(data); // this is the file name under which the output image will be stored
- hierarchy[filename] = data;
- }
-}
-
-/**
- * This utility function factors out the process
- * of creating a zip file and sending it back to the client
- * by piping it into a response.
- *
- * Learn more about piping and readable / writable streams here!
- * https://www.freecodecamp.org/news/node-js-streams-everything-you-need-to-know-c9141306be93/
- *
- * @param res the writable stream response object that will transfer the generated zip file
- * @param mutator the callback function used to actually modify and insert information into the zip instance
- */
-export async function BuildAndDispatchZip(res: express.Response, mutator: ZipMutator): Promise<void> {
- res.set('Content-disposition', `attachment;`);
- res.set('Content-Type', 'application/zip');
- const zip = Archiver('zip');
- zip.pipe(res);
- await mutator(zip);
- return zip.finalize();
-}
-
-/**
- *
- * @param file the zip file to which we write the files
- * @param hierarchy the data structure from which we read, defining the nesting of the documents in the zip
- * @param prefix lets us create nested folders in the zip file by continually appending to the end
- * of the prefix with each layer of recursion.
- *
- * Function Call #1 => "Dash Export"
- * Function Call #2 => "Dash Export/a nested collection"
- * Function Call #3 => "Dash Export/a nested collection/lowest level collection"
- * ...
- */
-async function writeHierarchyRecursive(file: Archiver.Archiver, hierarchy: Hierarchy, prefix = 'Dash Export'): Promise<void> {
- // eslint-disable-next-line no-restricted-syntax
- for (const documentTitle in hierarchy) {
- if (Object.prototype.hasOwnProperty.call(hierarchy, documentTitle)) {
- const result = hierarchy[documentTitle];
- // base case or leaf node, we've hit a url (image)
- if (typeof result === 'string') {
- let fPath: string;
- const matches = /:\d+\/files\/images\/(upload_[\da-z]{32}.*)/g.exec(result);
- if (matches !== null) {
- // image already exists on our server
- fPath = serverPathToFile(Directory.images, matches[1]);
- } else {
- // the image doesn't already exist on our server (may have been dragged
- // and dropped in the browser and thus hosted remotely) so we upload it
- // to our server and point the zip file to it, so it can bundle up the bytes
- // eslint-disable-next-line no-await-in-loop
- const information = await DashUploadUtils.UploadImage(result);
- fPath = information instanceof Error ? '' : information.accessPaths[SizeSuffix.Original].server;
- }
- // write the file specified by the path to the directory in the
- // zip file given by the prefix.
- if (fPath) {
- file.file(fPath, { name: documentTitle, prefix });
- }
- } else {
- // we've hit a collection, so we have to recurse
- // eslint-disable-next-line no-await-in-loop
- await writeHierarchyRecursive(file, result, `${prefix}/${documentTitle}`);
- }
- }
- }
-}
-
-async function getDocs(docId: string) {
- const files = new Set<string>();
- const docs: { [id: string]: any } = {};
- const fn = (doc: any): string[] => {
- const { id } = doc;
- if (typeof id === 'string' && id.endsWith('Proto')) {
- // Skip protos
- return [];
- }
- const ids: string[] = [];
- // eslint-disable-next-line no-restricted-syntax
- for (const key in doc.fields) {
- // eslint-disable-next-line no-continue
- if (!Object.prototype.hasOwnProperty.call(doc.fields, key)) continue;
-
- const field = doc.fields[key];
- // eslint-disable-next-line no-continue
- if (field === undefined || field === null) continue;
-
- if (field.__type === 'proxy' || field.__type === 'prefetch_proxy') {
- ids.push(field.fieldId);
- } else if (field.__type === 'script' || field.__type === 'computed') {
- field.captures && ids.push(field.captures.fieldId);
- } else if (field.__type === 'list') {
- ids.push(...fn(field));
- } else if (typeof field === 'string') {
- const re = /"(?:dataD|d)ocumentId"\s*:\s*"([\w-]*)"/g;
- for (let match = re.exec(field); match !== null; match = re.exec(field)) {
- ids.push(match[1]);
- }
- } else if (field.__type === 'RichTextField') {
- const re = /"href"\s*:\s*"(.*?)"/g;
- for (let match = re.exec(field.data); match !== null; match = re.exec(field.Data)) {
- const urlString = match[1];
- const split = new URL(urlString).pathname.split('doc/');
- if (split.length > 1) {
- ids.push(split[split.length - 1]);
- }
- }
- const re2 = /"src"\s*:\s*"(.*?)"/g;
- for (let match = re2.exec(field.Data); match !== null; match = re2.exec(field.Data)) {
- const urlString = match[1];
- const { pathname } = new URL(urlString);
- files.add(pathname);
- }
- } else if (['audio', 'image', 'video', 'pdf', 'web', 'map'].includes(field.__type)) {
- const { pathname } = new URL(field.url);
- files.add(pathname);
- }
- }
-
- if (doc.id) {
- docs[doc.id] = doc;
- }
- return ids;
- };
- await Database.Instance.visit([docId], fn);
- return { id: docId, docs, files };
-}
-
-export default class DownloadManager extends ApiManager {
- protected initialize(register: Registration): void {
- /**
- * Let's say someone's using Dash to organize images in collections.
- * This lets them export the hierarchy they've built to their
- * own file system in a useful format.
- *
- * This handler starts with a single document id (interesting only
- * if it's that of a collection). It traverses the database, captures
- * the nesting of only nested images or collections, writes
- * that to a zip file and returns it to the client for download.
- */
- register({
- method: Method.GET,
- subscription: new RouteSubscriber('imageHierarchyExport').add('docId'),
- secureHandler: async ({ req, res }) => {
- const id = req.params.docId;
- const hierarchy: Hierarchy = {};
- await buildHierarchyRecursive(id, hierarchy);
- return BuildAndDispatchZip(res, zip => writeHierarchyRecursive(zip, hierarchy));
- },
- });
-
- register({
- method: Method.GET,
- subscription: new RouteSubscriber('downloadId').add('docId'),
- secureHandler: async ({ req, res }) =>
- BuildAndDispatchZip(res, async zip => {
- const { id, docs, files } = await getDocs(req.params.docId);
- const docString = JSON.stringify({ id, docs });
- zip.append(docString, { name: 'doc.json' });
- files.forEach(val => {
- zip.file(publicDirectory + val, { name: val.substring(1) });
- });
- }),
- });
-
- register({
- method: Method.GET,
- subscription: new RouteSubscriber('serializeDoc').add('docId'),
- secureHandler: async ({ req, res }) => {
- const { docs, files } = await getDocs(req.params.docId);
- res.send({ docs, files: Array.from(files) });
- },
- });
- }
-}
diff --git a/src/server/ApiManagers/UploadManager.ts b/src/server/ApiManagers/UploadManager.ts
index 1e68a4e30..5e527281f 100644
--- a/src/server/ApiManagers/UploadManager.ts
+++ b/src/server/ApiManagers/UploadManager.ts
@@ -131,6 +131,9 @@ export default class UploadManager extends ApiManager {
},
});
+ type fieldstype = string | { __type: string; Data: string } | { __type: string; id: string; fieldId: string; fields: fieldstype[]; captures: { fieldId: string } };
+ type doctype = { id: string; fields: fieldstype[] };
+
register({
method: Method.POST,
subscription: '/uploadDoc',
@@ -145,7 +148,7 @@ export default class UploadManager extends ApiManager {
ids[id] = uuid.v4();
return ids[id];
};
- const mapFn = (docIn: { id: string; fields: any[] }) => {
+ const mapFn = (docIn: doctype) => {
const doc = docIn;
if (doc.id) {
doc.id = getId(doc.id);
@@ -156,22 +159,20 @@ export default class UploadManager extends ApiManager {
const field = doc.fields[key];
if (field === undefined || field === null) continue;
- if (field.__type === 'Doc') {
- mapFn(field);
+ if (typeof field === 'string') {
+ const re = /("(?:dataD|d)ocumentId"\s*:\s*")([\w-]*)"/g;
+ doc.fields[key] = field.replace(re, (match: string, p1: string, p2: string) => `${p1}${getId(p2)}"`);
+ } else if ('Data' in field) {
+ const re = /("href"\s*:\s*")(.*?)"/g;
+ field.Data = field.Data.replace(re, (match: string, p1: string, p2: string) => `${p1}${getId(p2)}"`);
} else if (field.__type === 'proxy' || field.__type === 'prefetch_proxy') {
field.fieldId = getId(field.fieldId);
} else if (field.__type === 'script' || field.__type === 'computed') {
if (field.captures) {
field.captures.fieldId = getId(field.captures.fieldId);
}
- } else if (field.__type === 'list') {
+ } else if (field.__type === 'list' || field.__type === 'Doc') {
mapFn(field);
- } else if (typeof field === 'string') {
- const re = /("(?:dataD|d)ocumentId"\s*:\s*")([\w-]*)"/g;
- doc.fields[key] = field.replace(re, (match: string, p1: string, p2: string) => `${p1}${getId(p2)}"`);
- } else if (field.__type === 'RichTextField') {
- const re = /("href"\s*:\s*")(.*?)"/g;
- field.Data = field.Data.replace(re, (match: string, p1: string, p2: string) => `${p1}${getId(p2)}"`);
}
}
};
diff --git a/src/server/chunker/requirements.txt b/src/server/chunker/requirements.txt
index 20bd486e5..586bbe505 100644
--- a/src/server/chunker/requirements.txt
+++ b/src/server/chunker/requirements.txt
@@ -1,15 +1,36 @@
+# Prefer official CPU wheels from the PyTorch index
+--extra-index-url https://download.pytorch.org/whl/cpu
+
+###############################################################################
+# Stable env for pdf_chunker.py #
+###############################################################################
+
+# ─── LLM clients ─────────────────────────────────────────────────────────────
+openai==1.40.6
+httpx==0.27.2 # <0.28 → avoids "proxies=" crash
anthropic==0.34.0
cohere==5.8.0
-python-dotenv==1.0.1
+
+# ─── Torch stack (CPU) ───────────────────────────────────────────────────────
+torch<=2.7.1
+torchvision<=0.22.1 # matches torch 2.7.x
+torchaudio<=2.7.1
+
+# ─── Vision / OCR / PDF processing ───────────────────────────────────────────
+ultralyticsplus==0.0.28
+easyocr==1.7.0
pymupdf==1.22.2
-lxml==5.3.0
+PyPDF2==3.0.1
+pytesseract==0.3.10
+Pillow==10.4.0
layoutparser==0.3.4
+lxml==5.3.0
+
+# ─── ML / maths ──────────────────────────────────────────────────────────────
numpy==1.26.4
-openai==1.40.6
-Pillow==10.4.0
-pytesseract==0.3.10
-PyPDF2==3.0.1
scikit-learn==1.5.1
+
+# ─── Utilities ──────────────────────────────────────────────────────────────
tqdm==4.66.5
-ultralyticsplus==0.0.28
-easyocr==1.7.0 \ No newline at end of file
+python-dotenv==1.0.1
+packaging==24.0 \ No newline at end of file
diff --git a/src/server/index.ts b/src/server/index.ts
index 3b77359ec..eb9bbaa2d 100644
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -7,7 +7,6 @@ import AssistantManager from './ApiManagers/AssistantManager';
import FlashcardManager from './ApiManagers/FlashcardManager';
import DataVizManager from './ApiManagers/DataVizManager';
import DeleteManager from './ApiManagers/DeleteManager';
-import DownloadManager from './ApiManagers/DownloadManager';
import FireflyManager from './ApiManagers/FireflyManager';
import GeneralGoogleManager from './ApiManagers/GeneralGoogleManager';
import SessionManager from './ApiManagers/SessionManager';
@@ -67,7 +66,6 @@ function routeSetter({ addSupervisedRoute, logRegistrationOutcome }: RouteManage
new SessionManager(),
new UserManager(),
new UploadManager(),
- new DownloadManager(),
new DeleteManager(),
new UtilManager(),
new GeneralGoogleManager(),