author    | mehekj <mehek.jethani@gmail.com> | 2022-03-17 17:08:25 -0400
committer | mehekj <mehek.jethani@gmail.com> | 2022-03-17 17:08:25 -0400
commit    | 1f7cf7babc76ecff5aef5fe663c48e067e85dd26 (patch)
tree      | adab9947ee887b57f2ff630a78139853b2011f92 /src/client/ClientRecommender.tsx
parent    | 1e3ad4de06f83eab54628de660529fefb9a0dc63 (diff)
parent    | 73ba1a0395167ab5949f71d0c82fa7188d37ab5c (diff)
Merge remote-tracking branch 'origin/speedups2' into temporalmedia-mehek
Diffstat (limited to 'src/client/ClientRecommender.tsx')
-rw-r--r-- | src/client/ClientRecommender.tsx | 421
1 file changed, 0 insertions, 421 deletions
diff --git a/src/client/ClientRecommender.tsx b/src/client/ClientRecommender.tsx
deleted file mode 100644
index 1d4653471..000000000
--- a/src/client/ClientRecommender.tsx
+++ /dev/null
@@ -1,421 +0,0 @@
-import { Doc, FieldResult } from "../fields/Doc";
-import { StrCast, Cast } from "../fields/Types";
-import { List } from "../fields/List";
-import { CognitiveServices, Confidence, Tag, Service } from "./cognitive_services/CognitiveServices";
-import React = require("react");
-import { observer } from "mobx-react";
-import { observable, action, computed, reaction } from "mobx";
-// var assert = require('assert');
-// var sw = require('stopword');
-// var FeedParser = require('feedparser');
-// var https = require('https');
-import "./ClientRecommender.scss";
-import { JSXElement } from "babel-types";
-import { RichTextField } from "../fields/RichTextField";
-import { ToPlainText } from "../fields/FieldSymbols";
-import { listSpec } from "../fields/Schema";
-import { ComputedField } from "../fields/ScriptField";
-import { ImageField } from "../fields/URLField";
-import { KeyphraseQueryView } from "./views/KeyphraseQueryView";
-import { Networking } from "./Network";
-
-export interface RecommenderProps {
-    title: string;
-}
-
-/**
- * actualDoc: datadoc
- * vectorDoc: mean vector of text
- * score: similarity score to main doc
- */
-
-export interface RecommenderDocument {
-    actualDoc: Doc;
-    vectorDoc: number[];
-    score: number;
-}
-
-const fieldkey = "data";
-
-@observer
-export class ClientRecommender extends React.Component<RecommenderProps> {
-
-    static Instance: ClientRecommender;
-    private mainDoc?: RecommenderDocument;
-    private docVectors: Set<RecommenderDocument> = new Set();
-    public _queries: string[] = [];
-
-    @observable private corr_matrix = [[0, 0], [0, 0]]; // for testing
-
-    constructor(props: RecommenderProps) {
-        super(props);
-        if (!ClientRecommender.Instance) ClientRecommender.Instance = this;
-        ClientRecommender.Instance.docVectors = new Set();
-        //ClientRecommender.Instance.corr_matrix = [[0, 0], [0, 0]];
-    }
-
-    @action
-    public reset_docs() {
-        ClientRecommender.Instance.docVectors = new Set();
-        ClientRecommender.Instance.mainDoc = undefined;
-        ClientRecommender.Instance.corr_matrix = [[0, 0], [0, 0]];
-    }
-
-    /***
-     * Computes the cosine similarity between two vectors in Euclidean space.
-     */
-
-    private distance(vector1: number[], vector2: number[], metric: string = "cosine") {
-        // assert(vector1.length === vector2.length, "Vectors are not the same length");
-        let similarity: number;
-        switch (metric) {
-            case "cosine":
-                var dotproduct = 0;
-                var mA = 0;
-                var mB = 0;
-                for (let i = 0; i < vector1.length; i++) { // here you missed the i++
-                    dotproduct += (vector1[i] * vector2[i]);
-                    mA += (vector1[i] * vector1[i]);
-                    mB += (vector2[i] * vector2[i]);
-                }
-                mA = Math.sqrt(mA);
-                mB = Math.sqrt(mB);
-                similarity = (dotproduct) / ((mA) * (mB)); // here you needed extra brackets
-                return similarity;
-            case "euclidian":
-                var sum = 0;
-                for (let i = 0; i < vector1.length; i++) {
-                    sum += Math.pow(vector1[i] - vector2[i], 2);
-                }
-                similarity = Math.sqrt(sum);
-                return similarity;
-            default:
-                return 0;
-        }
-    }
-
-    /**
-     * Returns list of {doc, similarity (to main doc)} in increasing score
-     */
-
-    public computeSimilarities(distance_metric: string) {
-        const parameters: any = {};
-        Networking.PostToServer("/IBMAnalysis", parameters).then(response => {
-            console.log("ANALYSIS RESULTS! ", response);
-        });
-        ClientRecommender.Instance.docVectors.forEach((doc: RecommenderDocument) => {
-            if (ClientRecommender.Instance.mainDoc) {
-                const distance = ClientRecommender.Instance.distance(ClientRecommender.Instance.mainDoc.vectorDoc, doc.vectorDoc, distance_metric);
-                doc.score = distance;
-            }
-        }
-        );
-        const doclist = Array.from(ClientRecommender.Instance.docVectors);
-        if (distance_metric === "euclidian") {
-            doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score);
-        }
-        else {
-            doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => b.score - a.score);
-        }
-        return doclist;
-    }
-
-    /***
-     * Computes the mean of a set of vectors
-     */
-
-    public mean(paragraph: Set<number[]>) {
-        const n = 512;
-        const num_words = paragraph.size;
-        let meanVector = new Array<number>(n).fill(0); // mean vector
-        if (num_words > 0) { // check to see if paragraph actually was vectorized
-            paragraph.forEach((wordvec: number[]) => {
-                for (let i = 0; i < n; i++) {
-                    meanVector[i] += wordvec[i];
-                }
-            });
-            meanVector = meanVector.map(x => x / num_words);
-        }
-        return meanVector;
-    }
-
-    /***
-     * Processes sentence vector as Recommender Document, adds to Doc Set.
-     */
-
-    public processVector(vector: number[], dataDoc: Doc, isMainDoc: boolean) {
-        if (vector.length > 0) {
-            const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: vector, score: 0 };
-            ClientRecommender.Instance.addToDocSet(internalDoc, isMainDoc);
-        }
-    }
-
-    /***
-     * Adds to Doc set. Updates mainDoc (one clicked) if necessary.
-     */
-
-    private addToDocSet(internalDoc: RecommenderDocument, isMainDoc: boolean) {
-        if (ClientRecommender.Instance.docVectors) {
-            if (isMainDoc) ClientRecommender.Instance.mainDoc = internalDoc;
-            ClientRecommender.Instance.docVectors.add(internalDoc);
-        }
-    }
-
-    /***
-     * Generates tags for an image using Cognitive Services
-     */
-
-    generateMetadata = async (dataDoc: Doc, extDoc: Doc, threshold: Confidence = Confidence.Excellent) => {
-        const converter = (results: any) => {
-            const tagDoc = new Doc;
-            const tagsList = new List();
-            results.tags.map((tag: Tag) => {
-                tagsList.push(tag.name);
-                const sanitized = tag.name.replace(" ", "_");
-                tagDoc[sanitized] = ComputedField.MakeFunction(`(${tag.confidence} >= this.confidence) ? ${tag.confidence} : "${ComputedField.undefined}"`);
-            });
-            extDoc.generatedTags = tagsList;
-            tagDoc.title = "Generated Tags Doc";
-            tagDoc.confidence = threshold;
-            return tagDoc;
-        };
-        const url = this.url(dataDoc);
-        if (url) {
-            return CognitiveServices.Image.Appliers.ProcessImage(extDoc, ["generatedTagsDoc"], url, Service.ComputerVision, converter);
-        }
-    }
-
-    /***
-     * Gets URL of image
-     */
-
-    private url(dataDoc: Doc) {
-        const data = Cast(Doc.GetProto(dataDoc)[fieldkey], ImageField);
-        return data ? data.url.href : undefined;
-    }
-
-    /***
-     * Uses Cognitive Services to extract keywords from a document
-     */
-
-    public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, api: string = "bing", isMainDoc: boolean = false, image: boolean = false) {
-        // STEP 1. Consolidate data of document. Depends on type of document.
-        let data: string = "";
-        let taglist: FieldResult<List<string>> = undefined;
-        if (image) {
-            if (!extDoc.generatedTags) await this.generateMetadata(dataDoc, extDoc); // TODO: Automatically generate tags. Need to ask Sam about this.
-            if (extDoc.generatedTags) {
-                taglist = Cast(extDoc.generatedTags, listSpec("string"));
-                taglist!.forEach(tag => {
-                    data += tag + ", ";
-                });
-            }
-        }
-        else {
-            const fielddata = Cast(dataDoc.data, RichTextField, null);
-            data = fielddata?.Text || "";
-        }
-
-        // STEP 2. Upon receiving response from Text Cognitive Services, do additional processing on keywords.
-        // Currently we are still using Cognitive Services for internal recommendations, but in the future this might not be necessary.
-
-        const converter = async (results: any, data: string, isImage: boolean = false) => {
-            let keyterms = new List<string>(); // raw keywords
-            let kp_string: string = ""; // keywords*frequency concatenated into a string. input into TF
-            let highKP: string[] = [""]; // most frequent keyphrase
-            let high = 0;
-
-            if (isImage) { // no keyphrase processing necessary
-                kp_string = data;
-                if (taglist) {
-                    keyterms = taglist;
-                    highKP = [taglist[0]];
-                }
-            }
-            else { // text processing
-                results.documents.forEach((doc: any) => {
-                    const keyPhrases = doc.keyPhrases; // returned by Cognitive Services
-                    keyPhrases.map((kp: string) => {
-                        keyterms.push(kp);
-                        const frequency = this.countFrequencies(kp, data); // frequency of keyphrase in paragraph
-                        kp_string += kp + ", "; // ensures that if frequency is 0 for some reason kp is still added
-                        for (let i = 0; i < frequency - 1; i++) {
-                            kp_string += kp + ", "; // weights repeated keywords higher
-                        }
-                        // replaces highKP with new one
-                        if (frequency > high) {
-                            high = frequency;
-                            highKP = [kp];
-                        }
-                        // appends to current highKP phrase
-                        else if (frequency === high) {
-                            highKP.push(kp);
-                        }
-                    });
-                });
-            }
-            if (kp_string.length > 2) kp_string = kp_string.substring(0, kp_string.length - 2); // strips extra comma and space if there are a lot of keywords
-            console.log("kp_string: ", kp_string);
-
-            let ext_recs = "";
-            // Pushing keyword extraction to IBM for external recommendations. Should shift to internal eventually.
-            if (!internal) {
-                const parameters: any = {
-                    'language': 'en',
-                    'text': data,
-                    'features': {
-                        'keywords': {
-                            'sentiment': true,
-                            'emotion': true,
-                            'limit': 3
-                        }
-                    }
-                };
-                await Networking.PostToServer("/IBMAnalysis", parameters).then(response => {
-                    const sorted_keywords = response.result.keywords;
-                    if (sorted_keywords.length > 0) {
-                        console.log("IBM keyphrase", sorted_keywords[0]);
-                        highKP = [];
-                        for (let i = 0; i < 5; i++) {
-                            if (sorted_keywords[i]) {
-                                highKP.push(sorted_keywords[i].text);
-                            }
-                        }
-                        keyterms = new List<string>(highKP);
-                    }
-                });
-                //let kpqv = new KeyphraseQueryView({ keyphrases: ["hello"] });
-                ext_recs = await this.sendRequest([highKP[0]], api);
-            }
-
-            // keyterms: list for extDoc, kp_string: input to TF, ext_recs: {titles, urls} of retrieved results from highKP query
-            return { keyterms: keyterms, external_recommendations: ext_recs, kp_string: [kp_string] };
-        };
-
-        // STEP 3: Start recommendation pipeline. Branches off into internal and external in Cognitive Services
-        if (data !== "") {
-            return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, isMainDoc, internal);
-        }
-        return;
-    }
-
-    /**
-     *
-     * Counts frequencies of keyphrase in paragraph.
-     */
-
-    private countFrequencies(keyphrase: string, paragraph: string) {
-        const data = paragraph.split(/ |\n/); // splits by new lines and spaces
-        const kp_array = keyphrase.split(" ");
-        const num_keywords = kp_array.length;
-        const par_length = data.length;
-        let frequency = 0;
-        // slides keyphrase windows across paragraph and checks if it matches with corresponding paragraph slice
-        for (let i = 0; i <= par_length - num_keywords; i++) {
-            const window = data.slice(i, i + num_keywords);
-            if (JSON.stringify(window).toLowerCase() === JSON.stringify(kp_array).toLowerCase() || kp_array.every(val => window.includes(val))) {
-                frequency++;
-            }
-        }
-        return frequency;
-    }
-
-    /**
-     *
-     * API for sending arXiv request.
-     */
-
-    private async sendRequest(keywords: string[], api: string) {
-        let query = "";
-        keywords.forEach((kp: string) => query += " " + kp);
-        if (api === "arxiv") {
-            return new Promise<any>(resolve => {
-                this.arxivrequest(query).then(resolve);
-            });
-        }
-        else if (api === "bing") {
-            return new Promise<any>(resolve => {
-                this.bingWebSearch(query).then(resolve);
-            });
-        }
-        else {
-            console.log("no api specified :(");
-        }
-
-    }
-
-    /**
-     * Request to Bing API. Most of code is in Cognitive Services.
-     */
-
-    bingWebSearch = async (query: string) => {
-        const converter = async (results: any) => {
-            const title_vals: string[] = [];
-            const url_vals: string[] = [];
-            results.webPages.value.forEach((doc: any) => {
-                title_vals.push(doc.name);
-                url_vals.push(doc.url);
-            });
-            return { title_vals, url_vals };
-        };
-        return CognitiveServices.BingSearch.Appliers.analyzer(query, converter);
-    }
-
-    /**
-     * Actual request to the arXiv server for ML articles.
-     */
-
-    arxivrequest = async (query: string) => {
-        const xhttp = new XMLHttpRequest();
-        const serveraddress = "http://export.arxiv.org/api";
-        const maxresults = 5;
-        const endpoint = serveraddress + "/query?search_query=all:" + query + "&start=0&max_results=" + maxresults.toString();
-        const promisified = (resolve: any, reject: any) => {
-            xhttp.onreadystatechange = function () {
-                if (this.readyState === 4) {
-                    const result = xhttp.response;
-                    const xml = xhttp.responseXML;
-                    console.log("arXiv Result: ", xml);
-                    switch (this.status) {
-                        case 200:
-                            const title_vals: string[] = [];
-                            const url_vals: string[] = [];
-                            if (xml) {
-                                const titles = xml.getElementsByTagName("title");
-                                let counter = 1;
-                                if (titles && titles.length > 1) {
-                                    while (counter <= maxresults) {
-                                        const title = titles[counter].childNodes[0].nodeValue!;
-                                        title_vals.push(title);
-                                        counter++;
-                                    }
-                                }
-                                const ids = xml.getElementsByTagName("id");
-                                counter = 1;
-                                if (ids && ids.length > 1) {
-                                    while (counter <= maxresults) {
-                                        const url = ids[counter].childNodes[0].nodeValue!;
                                        url_vals.push(url);
-                                        counter++;
-                                    }
-                                }
-                            }
-                            return resolve({ title_vals, url_vals });
-                        case 400:
-                        default:
-                            return reject(result);
-                    }
-                }
-            };
-            xhttp.open("GET", endpoint, true);
-            xhttp.send();
-        };
-        return new Promise<any>(promisified);
-    }
-
-    render() {
-        return (<div className="wrapper">
-        </div>);
-    }
-
-}
\ No newline at end of file
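
For reference, the heart of the removed recommender is the similarity scoring in `distance` and `computeSimilarities`: each document is reduced to a mean embedding vector, scored against the clicked "main" document by cosine similarity, and the candidates are sorted by that score. Below is a minimal standalone sketch of that idea, with hypothetical names and plain `number[]` vectors standing in for the Dash `Doc` types; it is not part of the deleted file or the Dash codebase.

```typescript
// Illustrative sketch only: the scoring logic of the removed ClientRecommender,
// reduced to plain arrays.
interface ScoredDoc {
    id: string;        // stand-in for the Dash Doc reference
    vector: number[];  // mean embedding of the document's text
    score: number;     // similarity to the main document
}

// Cosine similarity between two equal-length vectors.
function cosineSimilarity(a: number[], b: number[]): number {
    let dot = 0, magA = 0, magB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        magA += a[i] * a[i];
        magB += b[i] * b[i];
    }
    return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}

// Score every candidate against the main document and return them best-first
// (descending for cosine; a Euclidean metric would sort ascending instead).
function rankByCosine(main: ScoredDoc, candidates: ScoredDoc[]): ScoredDoc[] {
    candidates.forEach(doc => { doc.score = cosineSimilarity(main.vector, doc.vector); });
    return [...candidates].sort((a, b) => b.score - a.score);
}
```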