aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBob Zeleznik <zzzman@gmail.com>2019-09-29 22:27:03 -0400
committerBob Zeleznik <zzzman@gmail.com>2019-09-29 22:27:03 -0400
commit0ae9a7f6acbdf6ecade8d349981e8d6badef7ff9 (patch)
tree042d4825aefb8cda39c32e23f2050ac5ae03fb8d
parentc40480ec1a2da84b4223b0605bea2fe19df1104c (diff)
added pdf text searching
-rw-r--r--package.json2
-rw-r--r--src/client/util/SearchUtil.ts27
-rw-r--r--src/client/views/collections/CollectionSubView.tsx8
-rw-r--r--src/client/views/search/SearchBox.tsx15
-rw-r--r--src/client/views/search/SearchItem.tsx3
-rw-r--r--src/server/index.ts47
6 files changed, 86 insertions, 16 deletions
diff --git a/package.json b/package.json
index b373ee4a5..307283214 100644
--- a/package.json
+++ b/package.json
@@ -133,6 +133,7 @@
"express-session": "^1.15.6",
"express-validator": "^5.3.1",
"expressjs": "^1.0.1",
+ "find-in-files": "^0.5.0",
"flexlayout-react": "^0.3.3",
"font-awesome": "^4.7.0",
"formidable": "^1.2.1",
@@ -165,6 +166,7 @@
"p-limit": "^2.2.0",
"passport": "^0.4.0",
"passport-local": "^1.0.0",
+ "pdf-parse": "^1.1.1",
"pdfjs-dist": "^2.0.943",
"probe-image-size": "^4.0.0",
"prosemirror-commands": "^1.0.8",
diff --git a/src/client/util/SearchUtil.ts b/src/client/util/SearchUtil.ts
index ee5a83710..d8b9dbec6 100644
--- a/src/client/util/SearchUtil.ts
+++ b/src/client/util/SearchUtil.ts
@@ -3,18 +3,22 @@ import { DocServer } from '../DocServer';
import { Doc } from '../../new_fields/Doc';
import { Id } from '../../new_fields/FieldSymbols';
import { Utils } from '../../Utils';
+import { ResultParameters } from '../northstar/model/idea/idea';
+import { DocumentType } from '../documents/DocumentTypes';
export namespace SearchUtil {
export type HighlightingResult = { [id: string]: { [key: string]: string[] } };
export interface IdSearchResult {
ids: string[];
+ lines: string[][];
numFound: number;
highlighting: HighlightingResult | undefined;
}
export interface DocSearchResult {
docs: Doc[];
+ lines: string[][];
numFound: number;
highlighting: HighlightingResult | undefined;
}
@@ -30,16 +34,31 @@ export namespace SearchUtil {
export function Search(query: string, returnDocs: false, options?: SearchParams): Promise<IdSearchResult>;
export async function Search(query: string, returnDocs: boolean, options: SearchParams = {}) {
query = query || "*"; //If we just have a filter query, search for * as the query
- const result: IdSearchResult = JSON.parse(await rp.get(Utils.prepend("/search"), {
+ let result: IdSearchResult = JSON.parse(await rp.get(Utils.prepend("/search"), {
qs: { ...options, q: query },
}));
if (!returnDocs) {
return result;
}
- const { ids, numFound, highlighting } = result;
+
+ let { ids, numFound, highlighting } = result;
+ let lines: string[][] = ids.map(i => []); // one (initially empty) list of matched text lines per id from /search; kept index-aligned with ids
+ // Unless the query is the "*" wildcard, also query the /textsearch endpoint; its ids and lines fields are read below.
+ let txtresult = query !== "*" && JSON.parse(await rp.get(Utils.prepend("/textsearch"), {
+ qs: { ...options, q: query },
+ }));
+ let fileids = txtresult ? txtresult.ids : [];
+ await Promise.all(fileids.map(async (tr: string, i: number) => {
+ let docQuery = "fileUpload_t:" + tr.substr(0, 7); // second /search query on the fileUpload_t field, using only the first 7 characters of the upload id
+ let docResult = JSON.parse(await rp.get(Utils.prepend("/search"), { qs: { ...options, q: docQuery } }));
+ ids.push(...docResult.ids); // appended in request-completion order, together with the matching lines entry below
+ lines.push(...docResult.ids.map((dr: any) => txtresult.lines[i])); // every id found for this upload gets that upload's matched lines
+ numFound += docResult.numFound;
+ }));
+ // NOTE(review): lines is index-aligned with ids, but docs below is filtered, so docs[i] and lines[i] can disagree once any id is dropped - confirm callers tolerate this.
const docMap = await DocServer.GetRefFields(ids);
- const docs = ids.map((id: string) => docMap[id]).filter((doc: any) => doc instanceof Doc);
- return { docs, numFound, highlighting };
+ const docs = ids.map((id: string) => docMap[id]).filter((doc: any) => doc instanceof Doc && doc.type !== DocumentType.KVP); // keep only resolved Docs whose type is not KVP
+ return { docs, numFound, highlighting, lines };
}
export async function GetAliasesOfDocument(doc: Doc): Promise<Doc[]>;
diff --git a/src/client/views/collections/CollectionSubView.tsx b/src/client/views/collections/CollectionSubView.tsx
index 954a27cbd..cdeba6e16 100644
--- a/src/client/views/collections/CollectionSubView.tsx
+++ b/src/client/views/collections/CollectionSubView.tsx
@@ -22,6 +22,7 @@ import { CollectionPDFView } from "./CollectionPDFView";
import { CollectionVideoView } from "./CollectionVideoView";
import { CollectionView } from "./CollectionView";
import React = require("react");
+var path = require('path');
export interface CollectionViewProps extends FieldViewProps {
addDocument: (document: Doc, allowDuplicates?: boolean) => boolean;
@@ -261,8 +262,11 @@ export function CollectionSubView<T>(schemaCtor: (doc: Doc) => T) {
}).then(async (res: Response) => {
(await res.json()).map(action((file: any) => {
let full = { ...options, nativeWidth: type.indexOf("video") !== -1 ? 600 : 300, width: 300, title: dropFileName };
- let path = Utils.prepend(file);
- Docs.Get.DocumentFromType(type, path, full).then(doc => doc && this.props.addDocument(doc));
+ let pathname = Utils.prepend(file);
+ Docs.Get.DocumentFromType(type, pathname, full).then(doc => {
+ doc && (doc.fileUpload = path.basename(pathname).replace("upload_", "").replace(/\.[a-z0-9]*$/, ""));
+ doc && this.props.addDocument(doc);
+ });
}));
});
promises.push(prom);
diff --git a/src/client/views/search/SearchBox.tsx b/src/client/views/search/SearchBox.tsx
index 0d50124dd..f53270c64 100644
--- a/src/client/views/search/SearchBox.tsx
+++ b/src/client/views/search/SearchBox.tsx
@@ -18,6 +18,7 @@ import { FilterBox } from './FilterBox';
import "./FilterBox.scss";
import "./SearchBox.scss";
import { SearchItem } from './SearchItem';
+import { string } from 'prop-types';
library.add(faTimes);
@@ -27,7 +28,7 @@ export class SearchBox extends React.Component {
@observable private _searchString: string = "";
@observable private _resultsOpen: boolean = false;
@observable private _searchbarOpen: boolean = false;
- @observable private _results: [Doc, string[]][] = [];
+ @observable private _results: [Doc, string[], string[]][] = [];
private _resultsSet = new Map<Doc, number>();
@observable private _openNoResults: boolean = false;
@observable private _visibleElements: JSX.Element[] = [];
@@ -159,6 +160,8 @@ export class SearchBox extends React.Component {
const highlighting = res.highlighting || {};
const highlightList = res.docs.map(doc => highlighting[doc[Id]]);
+ const lines = new Map<string, string[]>();
+ res.docs.map((doc, i) => lines.set(doc[Id], res.lines[i]));
const docs = await Promise.all(res.docs.map(async doc => (await Cast(doc.extendsDoc, Doc)) || doc));
const highlights: typeof res.highlighting = {};
docs.forEach((doc, index) => highlights[doc[Id]] = highlightList[index]);
@@ -168,12 +171,14 @@ export class SearchBox extends React.Component {
filteredDocs.forEach(doc => {
const index = this._resultsSet.get(doc);
const highlight = highlights[doc[Id]];
+ const line = lines.get(doc[Id]) || [];
const hlights = highlight ? Object.keys(highlight).map(key => key.substring(0, key.length - 2)) : [];
if (index === undefined) {
this._resultsSet.set(doc, this._results.length);
- this._results.push([doc, hlights]);
+ this._results.push([doc, hlights, line]);
} else {
this._results[index][1].push(...hlights);
+ this._results[index][2].push(...line);
}
});
});
@@ -296,13 +301,13 @@ export class SearchBox extends React.Component {
}
else {
if (this._isSearch[i] !== "search") {
- let result: [Doc, string[]] | undefined = undefined;
+ let result: [Doc, string[], string[]] | undefined = undefined;
if (i >= this._results.length) {
this.getResults(this._searchString);
if (i < this._results.length) result = this._results[i];
if (result) {
let highlights = Array.from([...Array.from(new Set(result[1]).values())]).filter(v => v !== "search_string");
- this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} highlighting={highlights} />;
+ this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} lines={result[2]} highlighting={highlights} />;
this._isSearch[i] = "search";
}
}
@@ -310,7 +315,7 @@ export class SearchBox extends React.Component {
result = this._results[i];
if (result) {
let highlights = Array.from([...Array.from(new Set(result[1]).values())]).filter(v => v !== "search_string");
- this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} highlighting={highlights} />;
+ this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} lines={result[2]} highlighting={highlights} />;
this._isSearch[i] = "search";
}
}
diff --git a/src/client/views/search/SearchItem.tsx b/src/client/views/search/SearchItem.tsx
index 96eefacc2..4d021216d 100644
--- a/src/client/views/search/SearchItem.tsx
+++ b/src/client/views/search/SearchItem.tsx
@@ -30,6 +30,7 @@ export interface SearchItemProps {
doc: Doc;
query: string;
highlighting: string[];
+ lines: string[];
}
library.add(faCaretUp);
@@ -288,7 +289,7 @@ export class SearchItem extends React.Component<SearchItemProps> {
<div title="Drag as document" onPointerDown={this.onPointerDown} style={{ marginRight: "7px" }}> <FontAwesomeIcon icon="file" size="lg" /> </div>
<div className="search-title-container">
<div className="search-title">{StrCast(this.props.doc.title)}</div>
- <div className="search-highlighting">Matched fields: {this.props.highlighting.join(", ")}</div>
+ <div className="search-highlighting">{this.props.highlighting.length ? "Matched fields:" + this.props.highlighting.join(", ") : this.props.lines.length ? "Text:" + this.props.lines[0] : ""}</div>
</div>
<div className="search-info" style={{ width: this._useIcons ? "15%" : "400px" }}>
<div className={`icon-${this._useIcons ? "icons" : "live"}`}>
diff --git a/src/server/index.ts b/src/server/index.ts
index 50ce2b14e..524407a83 100644
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -42,13 +42,11 @@ var AdmZip = require('adm-zip');
import * as YoutubeApi from "./apis/youtube/youtubeApiSample";
import { Response } from 'express-serve-static-core';
import { GoogleApiServerUtils } from "./apis/google/GoogleApiServerUtils";
-import { GaxiosResponse } from 'gaxios';
-import { Opt } from '../new_fields/Doc';
-import { docs_v1 } from 'googleapis';
-import { Endpoint } from 'googleapis-common';
const MongoStore = require('connect-mongo')(session);
const mongoose = require('mongoose');
const probe = require("probe-image-size");
+const pdf = require('pdf-parse');
+var findInFiles = require('find-in-files');
const download = (url: string, dest: fs.PathLike) => request.get(url).pipe(fs.createWriteStream(dest));
let youtubeApiKey: string;
@@ -196,6 +194,23 @@ const solrURL = "http://localhost:8983/solr/#/dash";
// GETTERS
+app.get("/textsearch", async (req, res) => { // GET /textsearch?q=...: searches the .txt files under uploadDir + "text" and replies with { ids, numFound, lines }
+ const q = req.query.q;
+ const resObj: { ids: string[], numFound: number, lines: string[][] } = { ids: [], numFound: 0, lines: [] };
+ if (typeof q !== "string" || !q) { // missing, repeated (non-string) or empty q: reply with an empty result of the usual shape
+ res.send(resObj);
+ return;
+ }
+ const term = q.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // q is user input used as a regex term: escape metacharacters so it is matched literally
+ const results = await findInFiles.find({ 'term': term, 'flags': 'ig' }, uploadDir + "text", ".txt$");
+ for (const file of Object.keys(results)) { // each key is treated as the path of a matching .txt file
+ resObj.ids.push(path.basename(file, ".txt").replace(/upload_/, "")); // id = file name without the ".txt" suffix and "upload_" prefix
+ resObj.lines.push(results[file].line); // the matching line(s) reported for that file
+ resObj.numFound++;
+ }
+ res.send(resObj);
+});
+
app.get("/search", async (req, res) => {
const solrQuery: any = {};
["q", "fq", "start", "rows", "hl", "hl.fl"].forEach(key => solrQuery[key] = req.query[key]);
@@ -597,6 +612,30 @@ app.post(
fs.createReadStream(uploadDir + file).pipe(resizer.resizer).pipe(fs.createWriteStream(uploadDir + file.substring(0, file.length - ext.length) + resizer.suffix + ext));
});
}
+ if (ext.endsWith("pdf")) {
+ // Extract the PDF's text into uploadDir + "text/", using the upload's file name with its extension replaced by ".txt".
+ const textPath = uploadDir + "text/" + file.substring(0, file.length - ext.length) + ".txt";
+ const dataBuffer = fs.readFileSync(uploadDir + file);
+
+ pdf(dataBuffer).then((data: any) => {
+
+ // number of pages
+ // console.log(data.numpages);
+ // // number of rendered pages
+ // console.log(data.numrender);
+ // // PDF info
+ // console.log(data.info);
+ // // PDF metadata
+ // console.log(data.metadata);
+ // // PDF.js version
+ // // check https://mozilla.github.io/pdf.js/getting_started/
+ // console.log(data.version);
+ // // PDF text
+ // console.log(data.text);
+ fs.writeFile(textPath, data.text, err => err && console.log("failed to write " + textPath + ": " + err)); // writeFile opens, writes and closes the file in one call and reports write errors
+ }).catch((err: any) => console.log("failed to extract text from " + file + ": " + err)); // a PDF that cannot be parsed must not leave an unhandled rejection
+
+ }
names.push(`/files/` + file);
}
res.send(names);