aboutsummaryrefslogtreecommitdiff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r--src/scraping/buxton/final/BuxtonImporter.ts39
1 files changed, 23 insertions, 16 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 122415460..713207a07 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -16,6 +16,7 @@ interface DocumentContents {
hyperlinks: string[];
captions: string[];
embeddedFileNames: string[];
+ longDescription: string;
}
export interface DeviceDocument {
@@ -186,10 +187,6 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
transformer: Utilities.correctSentences
}],
- ["longDescription", {
- exp: /Bill Buxton[’']s Notes(.*)Device Details/,
- transformer: Utilities.correctSentences
- }],
]);
const sourceDir = path.resolve(__dirname, "source");
@@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
const captions: string[] = [];
const embeddedFileNames: string[] = [];
- const captionTargets = document.find(tableCellXPath).map(node => node.text());
+ const captionTargets = document.find(tableCellXPath).map(node => node.text().trim());
+
+ const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
+ const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
+ const end = paragraphs.indexOf("Device Details");
+ const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");
const { length } = captionTargets;
strictEqual(length > 3, true, "No captions written.");
@@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
zip.close();
- return { body, imageData, captions, embeddedFileNames, hyperlinks };
+ return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks };
}
const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
@@ -306,26 +308,30 @@ async function writeImages(zip: any): Promise<ImageData[]> {
const imageEntries = allEntries.filter(name => imageEntry.test(name));
const imageUrls: ImageData[] = [];
- for (const mediaPath of imageEntries) {
- const getImageStream = () => new Promise<Readable>((resolve, reject) => {
- zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
- });
+ const valid: any[] = [];
+ const getImageStream = (mediaPath: string) => new Promise<Readable>((resolve, reject) => {
+ zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
+ });
+
+ for (const mediaPath of imageEntries) {
const { width, height, type } = await new Promise<Dimensions>(async resolve => {
const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => {
readStream.destroy();
resolve(dimensions);
}).on("error", () => readStream.destroy());
- const readStream = await getImageStream();
+ const readStream = await getImageStream(mediaPath);
readStream.pipe(sizeStream);
});
- if (Math.abs(width - height) < 10) {
- continue;
+
+ if (Math.abs(width - height) > 10) {
+ valid.push({ width, height, type, mediaPath });
}
+ }
+ for (const { type, width, height, mediaPath } of valid) {
const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
- await DashUploadUtils.outputResizedImages(getImageStream, generatedFileName, imageDir);
-
+ await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir);
imageUrls.push({
url: `/files/images/buxton/${generatedFileName}`,
nativeWidth: width,
@@ -337,11 +343,12 @@ async function writeImages(zip: any): Promise<ImageData[]> {
}
function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
- const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents;
+ const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents;
const device: any = {
hyperlinks,
captions,
embeddedFileNames,
+ longDescription,
__images: imageData
};
const errors: { [key: string]: string } = { fileName };