diff options
author | ab <abdullah_ahmed@brown.edu> | 2019-07-10 13:01:16 -0400 |
---|---|---|
committer | ab <abdullah_ahmed@brown.edu> | 2019-07-10 13:01:16 -0400 |
commit | e3f0338f8f4b223a7e4389aebeb20ede5555510b (patch) | |
tree | 3fb181c0b16161d2c8b1d7f0a7cbcd34f8e1c3c4 /src/server/GarbageCollector.ts | |
parent | 1fb746bcb228a348da1b4d8056aab59e073ee89e (diff) | |
parent | fd8fcfe74fff78bc67b6302f917c53e69d598712 (diff) |
merged
Diffstat (limited to 'src/server/GarbageCollector.ts')
-rw-r--r-- | src/server/GarbageCollector.ts | 124 |
1 files changed, 124 insertions, 0 deletions
/**
 * src/server/GarbageCollector.ts
 *
 * Stand-alone script that removes unreachable data:
 *   1. starts from every user's `userDocumentId`,
 *   2. follows document references found in the fetched documents' fields,
 *   3. deletes every document whose id was not reached, and removes it from Search,
 *   4. deletes files in ./src/server/public/files/ that no reached document references.
 */
import { Database } from './database';

import * as path from 'path';
import * as fs from 'fs';
import { Search } from './Search';

/** Base used only so that relative URLs can be parsed; only the resulting pathname is read. */
const URL_PARSE_BASE = "http://localhost";

/** Matches `"documentId": "<id>"` / `"dataDocumentId": "<id>"` inside serialized strings. */
const DOCUMENT_ID_PATTERN = /"(?:dataD|d)ocumentId"\s*:\s*"([\w\-]*)"/g;
/** Matches `"href": "<url>"` inside rich text data. */
const HREF_PATTERN = /"href"\s*:\s*"(.*?)"/g;
/** Matches `"src": "<url>"` inside rich text data. */
const SRC_PATTERN = /"src"\s*:\s*"(.*?)"/g;

/** Field types whose `url` property points at a stored file. */
const FILE_FIELD_TYPES = ["audio", "image", "video", "pdf", "web"];

/**
 * Returns the first capture group of every match of `pattern` in `text`.
 *
 * @param pattern a regular expression with the global flag and at least one capture group
 * @param text the string to scan
 */
function firstGroups(pattern: RegExp, text: string): string[] {
    const groups: string[] = [];
    // The patterns are shared module constants, so always start scanning from the beginning.
    pattern.lastIndex = 0;
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(text)) !== null) {
        const group = match[1];
        if (group !== undefined) {
            groups.push(group);
        }
    }
    return groups;
}

/**
 * Extracts the pathname of a URL string without throwing.
 *
 * Relative URLs are resolved against a placeholder base. Values that are not
 * strings or cannot be parsed yield `undefined` (with a warning) instead of
 * aborting the whole collection run.
 */
function pathnameOf(urlString: unknown): string | undefined {
    if (typeof urlString !== "string") {
        return undefined;
    }
    try {
        return new URL(urlString, URL_PARSE_BASE).pathname;
    } catch {
        console.warn(`Couldn't parse URL ${urlString}`);
        return undefined;
    }
}

/**
 * Records that the file named by `pathname` is referenced, by appending its
 * extension to the list stored under its base name in `files`.
 */
function recordFile(pathname: string, files: { [name: string]: string[] }): void {
    const ext = path.extname(pathname);
    const fileName = path.basename(pathname, ext);
    if (!fileName) {
        return;
    }
    // Own-property check so names such as "constructor" don't resolve to Object.prototype members.
    let exts = Object.prototype.hasOwnProperty.call(files, fileName) ? files[fileName] : undefined;
    if (!exts) {
        files[fileName] = exts = [];
    }
    exts.push(ext);
}

/**
 * Scans the fields of one document and collects everything it references.
 *
 * @param doc   object whose own enumerable properties are the fields to scan
 * @param ids   output: ids of referenced documents are appended here
 * @param files output: maps a referenced file's base name to the extensions seen for it
 */
function addDoc(doc: any, ids: string[], files: { [name: string]: string[] }) {
    for (const key in doc) {
        if (!Object.prototype.hasOwnProperty.call(doc, key)) {
            continue;
        }
        const field = doc[key];
        if (field === undefined || field === null) {
            continue;
        }
        if (field.__type === "proxy") {
            ids.push(field.fieldId);
        } else if (field.__type === "list") {
            // Lists carry their elements in `fields`; scan them the same way.
            addDoc(field.fields, ids, files);
        } else if (typeof field === "string") {
            for (const id of firstGroups(DOCUMENT_ID_PATTERN, field)) {
                ids.push(id);
            }
        } else if (field.__type === "RichTextField") {
            const data = typeof field.Data === "string" ? field.Data : "";
            // Links whose path contains "doc/" reference the document named after the last "doc/".
            for (const href of firstGroups(HREF_PATTERN, data)) {
                const pathname = pathnameOf(href);
                if (pathname === undefined) {
                    continue;
                }
                const split = pathname.split("doc/");
                if (split.length > 1) {
                    ids.push(split[split.length - 1]);
                }
            }
            for (const src of firstGroups(SRC_PATTERN, data)) {
                const pathname = pathnameOf(src);
                if (pathname !== undefined) {
                    recordFile(pathname, files);
                }
            }
        } else if (FILE_FIELD_TYPES.includes(field.__type)) {
            const pathname = pathnameOf(field.url);
            if (pathname !== undefined) {
                recordFile(pathname, files);
            }
        }
    }
}

/**
 * Tells whether a file on disk is referenced according to `files`.
 *
 * The file's base name is looked up first as-is and then with its last two
 * characters removed (presumably a size/variant suffix — TODO confirm); the
 * file counts as referenced when the first name found lists its extension.
 */
function isFileReferenced(file: string, files: { [name: string]: string[] }): boolean {
    const ext = path.extname(file);
    const base = path.basename(file, ext);
    const candidates = [base, base.substring(0, base.length - 2)];
    for (const candidate of candidates) {
        if (Object.prototype.hasOwnProperty.call(files, candidate)) {
            return files[candidate].includes(ext);
        }
    }
    return false;
}

/**
 * Deletes every file in `folder` (except ".gitignore") that `files` does not reference.
 * Individual deletion failures are logged and skipped; a failure to list the
 * folder rejects the returned promise.
 */
function deleteUnreferencedFiles(folder: string, files: { [name: string]: string[] }): Promise<void> {
    return new Promise<void>((resolve, reject) => {
        fs.readdir(folder, (err, fileList) => {
            if (err) {
                reject(err);
                return;
            }
            const filesToDelete = fileList.filter(file => file !== ".gitignore" && !isFileReferenced(file, files));
            console.log(`Deleting ${filesToDelete.length} files`);
            let deleted = 0;
            filesToDelete.forEach(file => {
                console.log(`Deleting file ${file}`);
                try {
                    fs.unlinkSync(path.join(folder, file));
                    deleted++;
                } catch {
                    console.warn(`Couldn't delete file ${file}`);
                }
            });
            // Report what was actually removed, not what was attempted.
            console.log(`Deleted ${deleted} files`);
            resolve();
        });
    });
}

/**
 * Runs one full garbage collection pass: walks all documents reachable from
 * the users' documents, deletes the rest from "newDocuments" and from Search,
 * then deletes unreferenced files from the public files folder.
 */
async function GarbageCollect() {
    // await new Promise(res => setTimeout(res, 3000));
    const ids: string[] = [];
    const cursor = await Database.Instance.query({}, { userDocumentId: 1 }, 'users');
    try {
        const users = await cursor.toArray();
        for (const user of users) {
            if (typeof user.userDocumentId === "string") {
                ids.push(user.userDocumentId);
            }
        }
    } finally {
        cursor.close();
    }
    const visited = new Set<string>();
    const files: { [name: string]: string[] } = {};

    // `ids` is a work stack: addDoc pushes newly discovered ids, batches are taken from the end.
    while (ids.length) {
        const count = Math.min(ids.length, 100);
        const index = ids.length - count;
        const batch = ids.splice(index, count).filter(id => !visited.has(id));
        const fetchIds = Array.from(new Set(batch));
        if (!fetchIds.length) {
            continue;
        }
        const docs = await new Promise<{ [key: string]: any }[]>(res => Database.Instance.getDocuments(fetchIds, res, "newDocuments"));
        const found = new Set<string>();
        for (const doc of docs) {
            // Check before touching `doc.id`; reading a property of undefined would throw.
            if (doc === undefined || doc === null) {
                continue;
            }
            const id = doc.id;
            found.add(id);
            visited.add(id);
            addDoc(doc.fields, ids, files);
        }
        for (const id of fetchIds) {
            if (!found.has(id)) {
                console.log(`Couldn't find field with Id ${id}`);
            }
        }
        console.log(`To Go: ${ids.length}, visited: ${visited.size}`);
    }

    console.log(`Done: ${visited.size}`);

    // NOTE(review): this query does not name a collection while the fetch above and the
    // delete below use "newDocuments" — confirm the default collection is the intended one.
    const toDeleteCursor = await Database.Instance.query({ _id: { $nin: Array.from(visited) } }, { _id: 1 });
    let toDelete: string[];
    try {
        toDelete = (await toDeleteCursor.toArray()).map(doc => doc._id);
    } finally {
        toDeleteCursor.close();
    }
    const result = await Database.Instance.delete({ _id: { $in: toDelete } }, "newDocuments");
    console.log(`${result.deletedCount} documents deleted`);

    await Search.Instance.deleteDocuments(toDelete);
    console.log("Cleared search documents");

    const folder = "./src/server/public/files/";
    await deleteUnreferencedFiles(folder, files);
}

GarbageCollect().catch(err => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error("Garbage collection failed", err);
    process.exitCode = 1;
});