diff options
author | Abdullah Ahmed <abdullah_ahmed@brown.edu> | 2019-10-12 16:00:04 -0400 |
---|---|---|
committer | Abdullah Ahmed <abdullah_ahmed@brown.edu> | 2019-10-12 16:00:04 -0400 |
commit | 814838063b6bbdf8dc813eb601de8da6b4ae0320 (patch) | |
tree | d306f77ad46a9ca4bcf611f02b7f4438551fb863 | |
parent | 1f8bf407ef49aab33294c3e7393718606dfa65dd (diff) |
refactor
-rw-r--r-- | src/client/ClientRecommender.tsx | 74 | ||||
-rw-r--r-- | src/client/cognitive_services/CognitiveServices.ts | 15 | ||||
-rw-r--r-- | src/client/views/nodes/DocumentView.tsx | 2 |
3 files changed, 45 insertions, 46 deletions
diff --git a/src/client/ClientRecommender.tsx b/src/client/ClientRecommender.tsx index bc1cd139c..0e1e8175a 100644 --- a/src/client/ClientRecommender.tsx +++ b/src/client/ClientRecommender.tsx @@ -36,7 +36,7 @@ export class ClientRecommender extends React.Component<RecommenderProps> { private mainDoc?: RecommenderDocument; private docVectors: Set<RecommenderDocument> = new Set(); - @observable private corr_matrix = [[0, 0], [0, 0]]; + @observable private corr_matrix = [[0, 0], [0, 0]]; // for testing constructor(props: RecommenderProps) { //console.log("creating client recommender..."); @@ -90,16 +90,21 @@ export class ClientRecommender extends React.Component<RecommenderProps> { * Returns list of {doc, similarity (to main doc)} in increasing score */ - public computeSimilarities() { + public computeSimilarities(distance_metric: string) { ClientRecommender.Instance.docVectors.forEach((doc: RecommenderDocument) => { if (ClientRecommender.Instance.mainDoc) { - const distance = ClientRecommender.Instance.distance(ClientRecommender.Instance.mainDoc.vectorDoc, doc.vectorDoc, "cosine"); + const distance = ClientRecommender.Instance.distance(ClientRecommender.Instance.mainDoc.vectorDoc, doc.vectorDoc, distance_metric); doc.score = distance; } } ); let doclist = Array.from(ClientRecommender.Instance.docVectors); - doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score); + if (distance_metric == "euclidian") { + doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score); + } + else { + doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => b.score - a.score); + } return doclist; } @@ -107,8 +112,8 @@ export class ClientRecommender extends React.Component<RecommenderProps> { * Computes the mean of a set of vectors */ - public mean(paragraph: Set<number[]>, dataDoc: Doc, mainDoc: boolean) { - const n = 200; + public mean(paragraph: Set<number[]>) { + const n = 512; const num_words = paragraph.size; let meanVector = new Array<number>(n).fill(0); // mean vector if (num_words > 0) { // check to see if paragraph actually was vectorized @@ -118,23 +123,20 @@ export class ClientRecommender extends React.Component<RecommenderProps> { } }); meanVector = meanVector.map(x => x / num_words); - const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: meanVector, score: 0 }; - if (mainDoc) ClientRecommender.Instance.mainDoc = internalDoc; - ClientRecommender.Instance.addToDocSet(internalDoc); } return meanVector; } - public processVector(vector: number[], dataDoc: Doc, mainDoc: boolean) { + public processVector(vector: number[], dataDoc: Doc, isMainDoc: boolean) { if (vector.length > 0) { const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: vector, score: 0 }; - if (mainDoc) ClientRecommender.Instance.mainDoc = internalDoc; - ClientRecommender.Instance.addToDocSet(internalDoc); + ClientRecommender.Instance.addToDocSet(internalDoc, isMainDoc); } } - private addToDocSet(internalDoc: RecommenderDocument) { + private addToDocSet(internalDoc: RecommenderDocument, isMainDoc: boolean) { if (ClientRecommender.Instance.docVectors) { + if (isMainDoc) ClientRecommender.Instance.mainDoc = internalDoc; ClientRecommender.Instance.docVectors.add(internalDoc); } } @@ -143,22 +145,25 @@ export class ClientRecommender extends React.Component<RecommenderProps> { * Uses Cognitive Services to extract keywords from a document */ - public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, mainDoc: boolean = false) { + public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, isMainDoc: boolean = false) { let fielddata = Cast(dataDoc.data, RichTextField); let data: string; fielddata ? data = fielddata[ToPlainText]() : data = ""; let converter = async (results: any, data: string) => { let keyterms = new List<string>(); // raw keywords - let keyterms_counted = new List<string>(); // keywords, where each keyword is repeated as - let kp_string: string = ""; + // let keyterms_counted = new List<string>(); // keywords, where each keyword is repeated. input to w2v + let kp_string: string = ""; // keywords concatenated into a string. input into TF let highKP: string[] = [""]; // most frequent keyphrase let high = 0; results.documents.forEach((doc: any) => { let keyPhrases = doc.keyPhrases; keyPhrases.map((kp: string) => { - const frequency = this.countFrequencies(kp, data); keyterms.push(kp); - kp_string += kp + ", "; + const frequency = this.countFrequencies(kp, data); // frequency of keyphrase in paragraph + kp_string += kp + ", "; // ensures that if frequency is 0 for some reason kp is still added + for (let i = 0; i < frequency - 1; i++) { + kp_string += kp + ", "; // weights repeated keywords higher + } // replaces highKP with new one if (frequency > high) { high = frequency; @@ -168,24 +173,25 @@ export class ClientRecommender extends React.Component<RecommenderProps> { else if (frequency === high) { highKP.push(kp); } - let words = kp.split(" "); // separates phrase into words - words = this.removeStopWords(words); // removes stop words if they appear in phrases - words.forEach((word) => { - //keyterms.push(word); - for (let i = 0; i < frequency; i++) { - keyterms_counted.push(word); - } - }); + // let words = kp.split(" "); // separates phrase into words + // words = this.removeStopWords(words); // removes stop words if they appear in phrases + // words.forEach((word) => { + // for (let i = 0; i < frequency; i++) { + // keyterms_counted.push(word); + // } + // }); }); }); - const kts_counted = new List<string>(); - keyterms_counted.forEach(kt => kts_counted.push(kt.toLowerCase())); + // const kts_counted = new List<string>(); + // keyterms_counted.forEach(kt => kts_counted.push(kt.toLowerCase())); + if (kp_string.length > 2) kp_string = kp_string.substring(0, kp_string.length - 2); + console.log("kp string: ", kp_string); let values = ""; if (!internal) values = await this.sendRequest(highKP); - return { keyterms: keyterms, keyterms_counted: kts_counted, values, kp_string: [kp_string] }; + return { keyterms: keyterms, external_recommendations: values, kp_string: [kp_string] }; }; if (data != "") { - return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, mainDoc, internal); + return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, isMainDoc, internal); } return; } @@ -196,7 +202,7 @@ export class ClientRecommender extends React.Component<RecommenderProps> { */ private countFrequencies(keyphrase: string, paragraph: string) { - let data = paragraph.split(" "); + let data = paragraph.split(/ |\n/); // splits by new lines and spaces let kp_array = keyphrase.split(" "); let num_keywords = kp_array.length; let par_length = data.length; @@ -289,12 +295,6 @@ export class ClientRecommender extends React.Component<RecommenderProps> { return new Promise<any>(promisified); } - processArxivResult = (result: any) => { - var xmlDoc = result as XMLDocument; - let text = xmlDoc.getElementsByTagName("title")[0].childNodes[0].nodeValue; - console.log(text); - } - render() { return (<div className="wrapper"> </div>); diff --git a/src/client/cognitive_services/CognitiveServices.ts b/src/client/cognitive_services/CognitiveServices.ts index eb088763d..48519f916 100644 --- a/src/client/cognitive_services/CognitiveServices.ts +++ b/src/client/cognitive_services/CognitiveServices.ts @@ -15,7 +15,7 @@ type RequestExecutor = (apiKey: string, body: string, service: Service) => Promi type AnalysisApplier<D> = (target: Doc, relevantKeys: string[], data: D, ...args: any) => any; type BodyConverter<D> = (data: D) => string; type Converter = (results: any) => Field; -type TextConverter = (results: any, data: string) => Promise<{ keyterms: Field, keyterms_counted: Field, values: any, kp_string: string[] }>; +type TextConverter = (results: any, data: string) => Promise<{ keyterms: Field, external_recommendations: any, kp_string: string[] }>; export type Tag = { name: string, confidence: number }; export type Rectangle = { top: number, left: number, width: number, height: number }; @@ -264,7 +264,7 @@ export namespace CognitiveServices { export namespace Appliers { - export async function vectorize(keyterms: any, dataDoc: Doc, mainDoc: boolean = false, data: string) { + export async function vectorize(keyterms: any, dataDoc: Doc, mainDoc: boolean = false) { console.log("vectorizing..."); //keyterms = ["father", "king"]; let args = { method: 'POST', uri: Utils.prepend("/recommender"), body: { keyphrases: keyterms }, json: true }; @@ -287,17 +287,16 @@ export namespace CognitiveServices { ); } - export const analyzer = async (dataDoc: Doc, target: Doc, keys: string[], data: string, converter: TextConverter, mainDoc: boolean = false, internal: boolean = true) => { + export const analyzer = async (dataDoc: Doc, target: Doc, keys: string[], data: string, converter: TextConverter, isMainDoc: boolean = false, internal: boolean = true) => { let results = await ExecuteQuery(Service.Text, Manager, data); console.log(results); - let { keyterms, values, keyterms_counted, kp_string } = await converter(results, data); - //target[keys[0]] = Docs.Get.DocumentHierarchyFromJson(results, "Key Word Analysis"); + let { keyterms, external_recommendations, kp_string } = await converter(results, data); target[keys[0]] = keyterms; - console.log("analyzed!"); if (internal) { - await vectorize(kp_string, dataDoc, mainDoc, data); + //await vectorize([data], dataDoc, isMainDoc); + await vectorize(kp_string, dataDoc, isMainDoc); } else { - return values; + return external_recommendations; } }; diff --git a/src/client/views/nodes/DocumentView.tsx b/src/client/views/nodes/DocumentView.tsx index 070b1f426..ab2717eed 100644 --- a/src/client/views/nodes/DocumentView.tsx +++ b/src/client/views/nodes/DocumentView.tsx @@ -602,7 +602,7 @@ export class DocumentView extends DocComponent<DocumentViewProps, Document>(Docu } } })); - const doclist = ClientRecommender.Instance.computeSimilarities(); + const doclist = ClientRecommender.Instance.computeSimilarities("cosine"); let recDocs: { preview: Doc, score: number }[] = []; // tslint:disable-next-line: prefer-for-of for (let i = 0; i < doclist.length; i++) { |