about | summary | refs | log | tree | commit | diff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r-- src/scraping/buxton/final/BuxtonImporter.ts | 17
1 files changed, 10 insertions, 7 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 122415460..64b988610 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -16,6 +16,7 @@ interface DocumentContents {
hyperlinks: string[];
captions: string[];
embeddedFileNames: string[];
+ longDescriptionParagraphs: string[];
}
export interface DeviceDocument {
@@ -186,10 +187,6 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
transformer: Utilities.correctSentences
}],
- ["longDescription", {
- exp: /Bill Buxton[’']s Notes(.*)Device Details/,
- transformer: Utilities.correctSentences
- }],
]);
const sourceDir = path.resolve(__dirname, "source");
@@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
const captions: string[] = [];
const embeddedFileNames: string[] = [];
- const captionTargets = document.find(tableCellXPath).map(node => node.text());
+ const captionTargets = document.find(tableCellXPath).map(node => node.text().trim());
+
+ const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
+ const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
+ const end = paragraphs.indexOf("Device Details");
+ const longDescriptionParagraphs = paragraphs.slice(start, end);
const { length } = captionTargets;
strictEqual(length > 3, true, "No captions written.");
@@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
zip.close();
- return { body, imageData, captions, embeddedFileNames, hyperlinks };
+ return { body, longDescriptionParagraphs, imageData, captions, embeddedFileNames, hyperlinks };
}
const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
@@ -337,7 +339,7 @@ async function writeImages(zip: any): Promise<ImageData[]> {
}
function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
- const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents;
+ const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents;
const device: any = {
hyperlinks,
captions,
@@ -376,6 +378,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
return { errors };
}
+ device.longDescription = longDescriptionParagraphs.join("\n\n");
return { device };
}