aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAbdullah Ahmed <abdullah_ahmed@brown.edu>2019-10-12 16:00:04 -0400
committerAbdullah Ahmed <abdullah_ahmed@brown.edu>2019-10-12 16:00:04 -0400
commit814838063b6bbdf8dc813eb601de8da6b4ae0320 (patch)
treed306f77ad46a9ca4bcf611f02b7f4438551fb863
parent1f8bf407ef49aab33294c3e7393718606dfa65dd (diff)
refactor
-rw-r--r--src/client/ClientRecommender.tsx74
-rw-r--r--src/client/cognitive_services/CognitiveServices.ts15
-rw-r--r--src/client/views/nodes/DocumentView.tsx2
3 files changed, 45 insertions, 46 deletions
diff --git a/src/client/ClientRecommender.tsx b/src/client/ClientRecommender.tsx
index bc1cd139c..0e1e8175a 100644
--- a/src/client/ClientRecommender.tsx
+++ b/src/client/ClientRecommender.tsx
@@ -36,7 +36,7 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
private mainDoc?: RecommenderDocument;
private docVectors: Set<RecommenderDocument> = new Set();
- @observable private corr_matrix = [[0, 0], [0, 0]];
+ @observable private corr_matrix = [[0, 0], [0, 0]]; // for testing
constructor(props: RecommenderProps) {
//console.log("creating client recommender...");
@@ -90,16 +90,21 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
* Returns list of {doc, similarity (to main doc)}, sorted by ascending score for the "euclidian" metric and by descending score for any other metric
*/
- public computeSimilarities() {
+ public computeSimilarities(distance_metric: string) {
ClientRecommender.Instance.docVectors.forEach((doc: RecommenderDocument) => {
if (ClientRecommender.Instance.mainDoc) {
- const distance = ClientRecommender.Instance.distance(ClientRecommender.Instance.mainDoc.vectorDoc, doc.vectorDoc, "cosine");
+ const distance = ClientRecommender.Instance.distance(ClientRecommender.Instance.mainDoc.vectorDoc, doc.vectorDoc, distance_metric);
doc.score = distance;
}
}
);
let doclist = Array.from(ClientRecommender.Instance.docVectors);
- doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score);
+ if (distance_metric == "euclidian") {
+ doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score);
+ }
+ else {
+ doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => b.score - a.score);
+ }
return doclist;
}
@@ -107,8 +112,8 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
* Computes the mean of a set of vectors
*/
- public mean(paragraph: Set<number[]>, dataDoc: Doc, mainDoc: boolean) {
- const n = 200;
+ public mean(paragraph: Set<number[]>) {
+ const n = 512;
const num_words = paragraph.size;
let meanVector = new Array<number>(n).fill(0); // mean vector
if (num_words > 0) { // check to see if paragraph actually was vectorized
@@ -118,23 +123,20 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
}
});
meanVector = meanVector.map(x => x / num_words);
- const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: meanVector, score: 0 };
- if (mainDoc) ClientRecommender.Instance.mainDoc = internalDoc;
- ClientRecommender.Instance.addToDocSet(internalDoc);
}
return meanVector;
}
- public processVector(vector: number[], dataDoc: Doc, mainDoc: boolean) {
+ public processVector(vector: number[], dataDoc: Doc, isMainDoc: boolean) {
if (vector.length > 0) {
const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: vector, score: 0 };
- if (mainDoc) ClientRecommender.Instance.mainDoc = internalDoc;
- ClientRecommender.Instance.addToDocSet(internalDoc);
+ ClientRecommender.Instance.addToDocSet(internalDoc, isMainDoc);
}
}
- private addToDocSet(internalDoc: RecommenderDocument) {
+ private addToDocSet(internalDoc: RecommenderDocument, isMainDoc: boolean) {
if (ClientRecommender.Instance.docVectors) {
+ if (isMainDoc) ClientRecommender.Instance.mainDoc = internalDoc;
ClientRecommender.Instance.docVectors.add(internalDoc);
}
}
@@ -143,22 +145,25 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
* Uses Cognitive Services to extract keywords from a document
*/
- public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, mainDoc: boolean = false) {
+ public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, isMainDoc: boolean = false) {
let fielddata = Cast(dataDoc.data, RichTextField);
let data: string;
fielddata ? data = fielddata[ToPlainText]() : data = "";
let converter = async (results: any, data: string) => {
let keyterms = new List<string>(); // raw keywords
- let keyterms_counted = new List<string>(); // keywords, where each keyword is repeated as
- let kp_string: string = "";
+ // let keyterms_counted = new List<string>(); // keywords, where each keyword is repeated. input to w2v
+ let kp_string: string = ""; // keywords concatenated into a string. input into TF
let highKP: string[] = [""]; // most frequent keyphrase
let high = 0;
results.documents.forEach((doc: any) => {
let keyPhrases = doc.keyPhrases;
keyPhrases.map((kp: string) => {
- const frequency = this.countFrequencies(kp, data);
keyterms.push(kp);
- kp_string += kp + ", ";
+ const frequency = this.countFrequencies(kp, data); // frequency of keyphrase in paragraph
+ kp_string += kp + ", "; // always append kp once, so it is still included when its counted frequency is 0
+ for (let i = 0; i < frequency - 1; i++) {
+ kp_string += kp + ", "; // weights repeated keywords higher
+ }
// replaces highKP with new one
if (frequency > high) {
high = frequency;
@@ -168,24 +173,25 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
else if (frequency === high) {
highKP.push(kp);
}
- let words = kp.split(" "); // separates phrase into words
- words = this.removeStopWords(words); // removes stop words if they appear in phrases
- words.forEach((word) => {
- //keyterms.push(word);
- for (let i = 0; i < frequency; i++) {
- keyterms_counted.push(word);
- }
- });
+ // let words = kp.split(" "); // separates phrase into words
+ // words = this.removeStopWords(words); // removes stop words if they appear in phrases
+ // words.forEach((word) => {
+ // for (let i = 0; i < frequency; i++) {
+ // keyterms_counted.push(word);
+ // }
+ // });
});
});
- const kts_counted = new List<string>();
- keyterms_counted.forEach(kt => kts_counted.push(kt.toLowerCase()));
+ // const kts_counted = new List<string>();
+ // keyterms_counted.forEach(kt => kts_counted.push(kt.toLowerCase()));
+ if (kp_string.length > 2) kp_string = kp_string.substring(0, kp_string.length - 2);
+ console.log("kp string: ", kp_string);
let values = "";
if (!internal) values = await this.sendRequest(highKP);
- return { keyterms: keyterms, keyterms_counted: kts_counted, values, kp_string: [kp_string] };
+ return { keyterms: keyterms, external_recommendations: values, kp_string: [kp_string] };
};
if (data != "") {
- return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, mainDoc, internal);
+ return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, isMainDoc, internal);
}
return;
}
@@ -196,7 +202,7 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
*/
private countFrequencies(keyphrase: string, paragraph: string) {
- let data = paragraph.split(" ");
+ let data = paragraph.split(/ |\n/); // splits by new lines and spaces
let kp_array = keyphrase.split(" ");
let num_keywords = kp_array.length;
let par_length = data.length;
@@ -289,12 +295,6 @@ export class ClientRecommender extends React.Component<RecommenderProps> {
return new Promise<any>(promisified);
}
- processArxivResult = (result: any) => {
- var xmlDoc = result as XMLDocument;
- let text = xmlDoc.getElementsByTagName("title")[0].childNodes[0].nodeValue;
- console.log(text);
- }
-
render() {
return (<div className="wrapper">
</div>);
diff --git a/src/client/cognitive_services/CognitiveServices.ts b/src/client/cognitive_services/CognitiveServices.ts
index eb088763d..48519f916 100644
--- a/src/client/cognitive_services/CognitiveServices.ts
+++ b/src/client/cognitive_services/CognitiveServices.ts
@@ -15,7 +15,7 @@ type RequestExecutor = (apiKey: string, body: string, service: Service) => Promi
type AnalysisApplier<D> = (target: Doc, relevantKeys: string[], data: D, ...args: any) => any;
type BodyConverter<D> = (data: D) => string;
type Converter = (results: any) => Field;
-type TextConverter = (results: any, data: string) => Promise<{ keyterms: Field, keyterms_counted: Field, values: any, kp_string: string[] }>;
+type TextConverter = (results: any, data: string) => Promise<{ keyterms: Field, external_recommendations: any, kp_string: string[] }>;
export type Tag = { name: string, confidence: number };
export type Rectangle = { top: number, left: number, width: number, height: number };
@@ -264,7 +264,7 @@ export namespace CognitiveServices {
export namespace Appliers {
- export async function vectorize(keyterms: any, dataDoc: Doc, mainDoc: boolean = false, data: string) {
+ export async function vectorize(keyterms: any, dataDoc: Doc, mainDoc: boolean = false) {
console.log("vectorizing...");
//keyterms = ["father", "king"];
let args = { method: 'POST', uri: Utils.prepend("/recommender"), body: { keyphrases: keyterms }, json: true };
@@ -287,17 +287,16 @@ export namespace CognitiveServices {
);
}
- export const analyzer = async (dataDoc: Doc, target: Doc, keys: string[], data: string, converter: TextConverter, mainDoc: boolean = false, internal: boolean = true) => {
+ export const analyzer = async (dataDoc: Doc, target: Doc, keys: string[], data: string, converter: TextConverter, isMainDoc: boolean = false, internal: boolean = true) => {
let results = await ExecuteQuery(Service.Text, Manager, data);
console.log(results);
- let { keyterms, values, keyterms_counted, kp_string } = await converter(results, data);
- //target[keys[0]] = Docs.Get.DocumentHierarchyFromJson(results, "Key Word Analysis");
+ let { keyterms, external_recommendations, kp_string } = await converter(results, data);
target[keys[0]] = keyterms;
- console.log("analyzed!");
if (internal) {
- await vectorize(kp_string, dataDoc, mainDoc, data);
+ //await vectorize([data], dataDoc, isMainDoc);
+ await vectorize(kp_string, dataDoc, isMainDoc);
} else {
- return values;
+ return external_recommendations;
}
};
diff --git a/src/client/views/nodes/DocumentView.tsx b/src/client/views/nodes/DocumentView.tsx
index 070b1f426..ab2717eed 100644
--- a/src/client/views/nodes/DocumentView.tsx
+++ b/src/client/views/nodes/DocumentView.tsx
@@ -602,7 +602,7 @@ export class DocumentView extends DocComponent<DocumentViewProps, Document>(Docu
}
}
}));
- const doclist = ClientRecommender.Instance.computeSimilarities();
+ const doclist = ClientRecommender.Instance.computeSimilarities("cosine");
let recDocs: { preview: Doc, score: number }[] = [];
// tslint:disable-next-line: prefer-for-of
for (let i = 0; i < doclist.length; i++) {