diff options
author | Bob Zeleznik <zzzman@gmail.com> | 2019-09-29 22:27:03 -0400 |
---|---|---|
committer | Bob Zeleznik <zzzman@gmail.com> | 2019-09-29 22:27:03 -0400 |
commit | 0ae9a7f6acbdf6ecade8d349981e8d6badef7ff9 (patch) | |
tree | 042d4825aefb8cda39c32e23f2050ac5ae03fb8d | |
parent | c40480ec1a2da84b4223b0605bea2fe19df1104c (diff) |
added pdf text searching
-rw-r--r-- | package.json | 2 | ||||
-rw-r--r-- | src/client/util/SearchUtil.ts | 27 | ||||
-rw-r--r-- | src/client/views/collections/CollectionSubView.tsx | 8 | ||||
-rw-r--r-- | src/client/views/search/SearchBox.tsx | 15 | ||||
-rw-r--r-- | src/client/views/search/SearchItem.tsx | 3 | ||||
-rw-r--r-- | src/server/index.ts | 47 |
6 files changed, 86 insertions, 16 deletions
diff --git a/package.json b/package.json index b373ee4a5..307283214 100644 --- a/package.json +++ b/package.json @@ -133,6 +133,7 @@ "express-session": "^1.15.6", "express-validator": "^5.3.1", "expressjs": "^1.0.1", + "find-in-files": "^0.5.0", "flexlayout-react": "^0.3.3", "font-awesome": "^4.7.0", "formidable": "^1.2.1", @@ -165,6 +166,7 @@ "p-limit": "^2.2.0", "passport": "^0.4.0", "passport-local": "^1.0.0", + "pdf-parse": "^1.1.1", "pdfjs-dist": "^2.0.943", "probe-image-size": "^4.0.0", "prosemirror-commands": "^1.0.8", diff --git a/src/client/util/SearchUtil.ts b/src/client/util/SearchUtil.ts index ee5a83710..d8b9dbec6 100644 --- a/src/client/util/SearchUtil.ts +++ b/src/client/util/SearchUtil.ts @@ -3,18 +3,22 @@ import { DocServer } from '../DocServer'; import { Doc } from '../../new_fields/Doc'; import { Id } from '../../new_fields/FieldSymbols'; import { Utils } from '../../Utils'; +import { ResultParameters } from '../northstar/model/idea/idea'; +import { DocumentType } from '../documents/DocumentTypes'; export namespace SearchUtil { export type HighlightingResult = { [id: string]: { [key: string]: string[] } }; export interface IdSearchResult { ids: string[]; + lines: string[][]; numFound: number; highlighting: HighlightingResult | undefined; } export interface DocSearchResult { docs: Doc[]; + lines: string[][]; numFound: number; highlighting: HighlightingResult | undefined; } @@ -30,16 +34,31 @@ export namespace SearchUtil { export function Search(query: string, returnDocs: false, options?: SearchParams): Promise<IdSearchResult>; export async function Search(query: string, returnDocs: boolean, options: SearchParams = {}) { query = query || "*"; //If we just have a filter query, search for * as the query - const result: IdSearchResult = JSON.parse(await rp.get(Utils.prepend("/search"), { + let result: IdSearchResult = JSON.parse(await rp.get(Utils.prepend("/search"), { qs: { ...options, q: query }, })); if (!returnDocs) { return result; } - const { ids, numFound, highlighting } = result; + + let { ids, numFound, highlighting } = result; + let lines: string[][] = ids.map(i => []); + + let txtresult = query !== "*" && JSON.parse(await rp.get(Utils.prepend("/textsearch"), { + qs: { ...options, q: query }, + })); + let fileids = txtresult ? txtresult.ids : []; + await Promise.all(fileids.map(async (tr: string, i: number) => { + let docQuery = "fileUpload_t:" + tr.substr(0, 7); //If we just have a filter query, search for * as the query + let docResult = JSON.parse(await rp.get(Utils.prepend("/search"), { qs: { ...options, q: docQuery } })); + ids.push(...docResult.ids); + lines.push(...docResult.ids.map((dr: any) => txtresult.lines[i])); + numFound += docResult.numFound; + })); + const docMap = await DocServer.GetRefFields(ids); - const docs = ids.map((id: string) => docMap[id]).filter((doc: any) => doc instanceof Doc); - return { docs, numFound, highlighting }; + const docs = ids.map((id: string) => docMap[id]).filter((doc: any) => doc instanceof Doc && doc.type !== DocumentType.KVP); + return { docs, numFound, highlighting, lines }; } export async function GetAliasesOfDocument(doc: Doc): Promise<Doc[]>; diff --git a/src/client/views/collections/CollectionSubView.tsx b/src/client/views/collections/CollectionSubView.tsx index 954a27cbd..cdeba6e16 100644 --- a/src/client/views/collections/CollectionSubView.tsx +++ b/src/client/views/collections/CollectionSubView.tsx @@ -22,6 +22,7 @@ import { CollectionPDFView } from "./CollectionPDFView"; import { CollectionVideoView } from "./CollectionVideoView"; import { CollectionView } from "./CollectionView"; import React = require("react"); +var path = require('path'); export interface CollectionViewProps extends FieldViewProps { addDocument: (document: Doc, allowDuplicates?: boolean) => boolean; @@ -261,8 +262,11 @@ export function CollectionSubView<T>(schemaCtor: (doc: Doc) => T) { }).then(async (res: Response) => { (await res.json()).map(action((file: any) => { let full = { ...options, nativeWidth: type.indexOf("video") !== -1 ? 600 : 300, width: 300, title: dropFileName }; - let path = Utils.prepend(file); - Docs.Get.DocumentFromType(type, path, full).then(doc => doc && this.props.addDocument(doc)); + let pathname = Utils.prepend(file); + Docs.Get.DocumentFromType(type, pathname, full).then(doc => { + doc && (doc.fileUpload = path.basename(pathname).replace("upload_", "").replace(/\.[a-z0-9]*$/, "")); + doc && this.props.addDocument(doc); + }); })); }); promises.push(prom); diff --git a/src/client/views/search/SearchBox.tsx b/src/client/views/search/SearchBox.tsx index 0d50124dd..f53270c64 100644 --- a/src/client/views/search/SearchBox.tsx +++ b/src/client/views/search/SearchBox.tsx @@ -18,6 +18,7 @@ import { FilterBox } from './FilterBox'; import "./FilterBox.scss"; import "./SearchBox.scss"; import { SearchItem } from './SearchItem'; +import { string } from 'prop-types'; library.add(faTimes); @@ -27,7 +28,7 @@ export class SearchBox extends React.Component { @observable private _searchString: string = ""; @observable private _resultsOpen: boolean = false; @observable private _searchbarOpen: boolean = false; - @observable private _results: [Doc, string[]][] = []; + @observable private _results: [Doc, string[], string[]][] = []; private _resultsSet = new Map<Doc, number>(); @observable private _openNoResults: boolean = false; @observable private _visibleElements: JSX.Element[] = []; @@ -159,6 +160,8 @@ export class SearchBox extends React.Component { const highlighting = res.highlighting || {}; const highlightList = res.docs.map(doc => highlighting[doc[Id]]); + const lines = new Map<string, string[]>(); + res.docs.map((doc, i) => lines.set(doc[Id], res.lines[i])); const docs = await Promise.all(res.docs.map(async doc => (await Cast(doc.extendsDoc, Doc)) || doc)); const highlights: typeof res.highlighting = {}; docs.forEach((doc, index) => highlights[doc[Id]] = highlightList[index]); @@ -168,12 +171,14 @@ export class SearchBox extends React.Component { filteredDocs.forEach(doc => { const index = this._resultsSet.get(doc); const highlight = highlights[doc[Id]]; + const line = lines.get(doc[Id]) || []; const hlights = highlight ? Object.keys(highlight).map(key => key.substring(0, key.length - 2)) : []; if (index === undefined) { this._resultsSet.set(doc, this._results.length); - this._results.push([doc, hlights]); + this._results.push([doc, hlights, line]); } else { this._results[index][1].push(...hlights); + this._results[index][2].push(...line); } }); }); @@ -296,13 +301,13 @@ export class SearchBox extends React.Component { } else { if (this._isSearch[i] !== "search") { - let result: [Doc, string[]] | undefined = undefined; + let result: [Doc, string[], string[]] | undefined = undefined; if (i >= this._results.length) { this.getResults(this._searchString); if (i < this._results.length) result = this._results[i]; if (result) { let highlights = Array.from([...Array.from(new Set(result[1]).values())]).filter(v => v !== "search_string"); - this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} highlighting={highlights} />; + this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} lines={result[2]} highlighting={highlights} />; this._isSearch[i] = "search"; } } @@ -310,7 +315,7 @@ export class SearchBox extends React.Component { result = this._results[i]; if (result) { let highlights = Array.from([...Array.from(new Set(result[1]).values())]).filter(v => v !== "search_string"); - this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} highlighting={highlights} />; + this._visibleElements[i] = <SearchItem doc={result[0]} query={this._searchString} key={result[0][Id]} lines={result[2]} highlighting={highlights} />; this._isSearch[i] = "search"; } } diff --git a/src/client/views/search/SearchItem.tsx b/src/client/views/search/SearchItem.tsx index 96eefacc2..4d021216d 100644 --- a/src/client/views/search/SearchItem.tsx +++ b/src/client/views/search/SearchItem.tsx @@ -30,6 +30,7 @@ export interface SearchItemProps { doc: Doc; query: string; highlighting: string[]; + lines: string[]; } library.add(faCaretUp); @@ -288,7 +289,7 @@ export class SearchItem extends React.Component<SearchItemProps> { <div title="Drag as document" onPointerDown={this.onPointerDown} style={{ marginRight: "7px" }}> <FontAwesomeIcon icon="file" size="lg" /> </div> <div className="search-title-container"> <div className="search-title">{StrCast(this.props.doc.title)}</div> - <div className="search-highlighting">Matched fields: {this.props.highlighting.join(", ")}</div> + <div className="search-highlighting">{this.props.highlighting.length ? "Matched fields:" + this.props.highlighting.join(", ") : this.props.lines.length ? "Text:" + this.props.lines[0] : ""}</div> </div> <div className="search-info" style={{ width: this._useIcons ? "15%" : "400px" }}> <div className={`icon-${this._useIcons ? "icons" : "live"}`}> diff --git a/src/server/index.ts b/src/server/index.ts index 50ce2b14e..524407a83 100644 --- a/src/server/index.ts +++ b/src/server/index.ts @@ -42,13 +42,11 @@ var AdmZip = require('adm-zip'); import * as YoutubeApi from "./apis/youtube/youtubeApiSample"; import { Response } from 'express-serve-static-core'; import { GoogleApiServerUtils } from "./apis/google/GoogleApiServerUtils"; -import { GaxiosResponse } from 'gaxios'; -import { Opt } from '../new_fields/Doc'; -import { docs_v1 } from 'googleapis'; -import { Endpoint } from 'googleapis-common'; const MongoStore = require('connect-mongo')(session); const mongoose = require('mongoose'); const probe = require("probe-image-size"); +const pdf = require('pdf-parse'); +var findInFiles = require('find-in-files'); const download = (url: string, dest: fs.PathLike) => request.get(url).pipe(fs.createWriteStream(dest)); let youtubeApiKey: string; @@ -196,6 +194,23 @@ const solrURL = "http://localhost:8983/solr/#/dash"; // GETTERS +app.get("/textsearch", async (req, res) => { + let q = req.query.q; + console.log("TEXTSEARCH " + q); + if (q === undefined) { + res.send([]); + return; + } + let results = await findInFiles.find({ 'term': q, 'flags': 'ig' }, uploadDir + "text", ".txt$"); + let resObj: { ids: string[], numFound: number, lines: string[] } = { ids: [], numFound: 0, lines: [] }; + for (var result in results) { + resObj.ids.push(path.basename(result, ".txt").replace(/upload_/, "")); + resObj.lines.push(results[result].line); + resObj.numFound++; + } + res.send(resObj); +}); + app.get("/search", async (req, res) => { const solrQuery: any = {}; ["q", "fq", "start", "rows", "hl", "hl.fl"].forEach(key => solrQuery[key] = req.query[key]); @@ -597,6 +612,30 @@ app.post( fs.createReadStream(uploadDir + file).pipe(resizer.resizer).pipe(fs.createWriteStream(uploadDir + file.substring(0, file.length - ext.length) + resizer.suffix + ext)); }); } + if (ext.endsWith("pdf")) { + var filePath = uploadDir + file; + + let dataBuffer = fs.readFileSync(filePath); + + pdf(dataBuffer).then(async function (data: any) { + + // number of pages + // console.log(data.numpages); + // // number of rendered pages + // console.log(data.numrender); + // // PDF info + // console.log(data.info); + // // PDF metadata + // console.log(data.metadata); + // // PDF.js version + // // check https://mozilla.github.io/pdf.js/getting_started/ + // console.log(data.version); + // // PDF text + // console.log(data.text); + fs.createWriteStream(uploadDir + "text/" + file.substring(0, file.length - ext.length) + ".txt").write(data.text); + }); + + } names.push(`/files/` + file); } res.send(names); |