diff options
author | mehekj <mehek.jethani@gmail.com> | 2022-03-20 10:29:42 -0400 |
---|---|---|
committer | mehekj <mehek.jethani@gmail.com> | 2022-03-20 10:29:42 -0400 |
commit | 0a5e02a87fdabff5ff8399829ff857cae90fc1e2 (patch) | |
tree | b7c05080dac66366768f23e59a43f62533a22415 /src/client/ClientRecommender.tsx | |
parent | 1f7cf7babc76ecff5aef5fe663c48e067e85dd26 (diff) |
Revert "Merge remote-tracking branch 'origin/speedups2' into temporalmedia-mehek"
This reverts commit 1f7cf7babc76ecff5aef5fe663c48e067e85dd26, reversing
changes made to 1e3ad4de06f83eab54628de660529fefb9a0dc63.
Diffstat (limited to 'src/client/ClientRecommender.tsx')
-rw-r--r-- | src/client/ClientRecommender.tsx | 421 |
1 files changed, 421 insertions, 0 deletions
// diff --git a/src/client/ClientRecommender.tsx b/src/client/ClientRecommender.tsx new file mode 100644 index 000000000..1d4653471 --- /dev/null +++ b/src/client/ClientRecommender.tsx @@ -0,0 +1,421 @@
import { Doc, FieldResult } from "../fields/Doc";
import { StrCast, Cast } from "../fields/Types";
import { List } from "../fields/List";
import { CognitiveServices, Confidence, Tag, Service } from "./cognitive_services/CognitiveServices";
import React = require("react");
import { observer } from "mobx-react";
import { observable, action, computed, reaction } from "mobx";
// var assert = require('assert');
// var sw = require('stopword');
// var FeedParser = require('feedparser');
// var https = require('https');
import "./ClientRecommender.scss";
import { JSXElement } from "babel-types";
import { RichTextField } from "../fields/RichTextField";
import { ToPlainText } from "../fields/FieldSymbols";
import { listSpec } from "../fields/Schema";
import { ComputedField } from "../fields/ScriptField";
import { ImageField } from "../fields/URLField";
import { KeyphraseQueryView } from "./views/KeyphraseQueryView";
import { Networking } from "./Network";

export interface RecommenderProps {
    title: string;
}

/**
 * actualDoc: datadoc
 * vectorDoc: mean vector of text
 * score: similarity score to main doc
 */
export interface RecommenderDocument {
    actualDoc: Doc;
    vectorDoc: number[];
    score: number;
}

// Key of the field on a document's proto that holds its image data.
const fieldkey = "data";

/**
 * Client-side recommender. Keeps a set of vectorized documents, scores them
 * against a "main" document, and extracts keywords from documents in order to
 * query external search services (Bing / arXiv).
 *
 * All state lives on the first constructed instance, reachable through
 * `ClientRecommender.Instance`.
 */
@observer
export class ClientRecommender extends React.Component<RecommenderProps> {

    static Instance: ClientRecommender;
    private mainDoc?: RecommenderDocument;
    private docVectors: Set<RecommenderDocument> = new Set();
    public _queries: string[] = [];

    @observable private corr_matrix = [[0, 0], [0, 0]]; // for testing

    constructor(props: RecommenderProps) {
        super(props);
        if (!ClientRecommender.Instance) ClientRecommender.Instance = this;
        // Every construction clears the shared document set, even when this
        // object does not become the singleton.
        ClientRecommender.Instance.docVectors = new Set();
        //ClientRecommender.Instance.corr_matrix = [[0, 0], [0, 0]];
    }

    /**
     * Clears the document set, the main document and the test correlation matrix.
     */
    @action
    public reset_docs() {
        ClientRecommender.Instance.docVectors = new Set();
        ClientRecommender.Instance.mainDoc = undefined;
        ClientRecommender.Instance.corr_matrix = [[0, 0], [0, 0]];
    }

    /**
     * Compares two vectors.
     *
     * @param vector1 first vector; its length drives the iteration
     * @param vector2 second vector, expected to be at least as long as vector1
     * @param metric "cosine" (similarity, higher is closer) or "euclidian"
     *               (distance, lower is closer)
     * @returns the similarity/distance; 0 for an unknown metric, and 0 for
     *          cosine when either vector has zero magnitude
     */
    private distance(vector1: number[], vector2: number[], metric: string = "cosine") {
        // assert(vector1.length === vector2.length, "Vectors are not the same length");
        switch (metric) {
            case "cosine": {
                let dotproduct = 0;
                let mA = 0;
                let mB = 0;
                for (let i = 0; i < vector1.length; i++) {
                    dotproduct += (vector1[i] * vector2[i]);
                    mA += (vector1[i] * vector1[i]);
                    mB += (vector2[i] * vector2[i]);
                }
                const denominator = Math.sqrt(mA) * Math.sqrt(mB);
                // A zero vector has no direction; returning NaN here would make the
                // sort comparators in computeSimilarities inconsistent.
                return denominator === 0 ? 0 : dotproduct / denominator;
            }
            case "euclidian": {
                let sum = 0;
                for (let i = 0; i < vector1.length; i++) {
                    sum += Math.pow(vector1[i] - vector2[i], 2);
                }
                return Math.sqrt(sum);
            }
            default:
                return 0;
        }
    }

    /**
     * Scores every stored document against the main document and returns them
     * ordered from most to least similar: ascending distance for "euclidian",
     * descending similarity for any other metric.
     *
     * When no main document is set, existing scores are left untouched.
     */
    public computeSimilarities(distance_metric: string) {
        // NOTE(review): this posts an empty parameter object and only logs the
        // reply — looks like leftover debugging; confirm before removing.
        const parameters: any = {};
        Networking.PostToServer("/IBMAnalysis", parameters).then(
            (response: any) => {
                console.log("ANALYSIS RESULTS! ", response);
            },
            (error: any) => {
                // fire-and-forget request: report instead of leaving the rejection unhandled
                console.error("IBM analysis request failed: ", error);
            });

        const instance = ClientRecommender.Instance;
        const mainDoc = instance.mainDoc;
        if (mainDoc) {
            instance.docVectors.forEach((doc: RecommenderDocument) => {
                doc.score = instance.distance(mainDoc.vectorDoc, doc.vectorDoc, distance_metric);
            });
        }

        const doclist = Array.from(instance.docVectors);
        if (distance_metric === "euclidian") {
            doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => a.score - b.score);
        }
        else {
            doclist.sort((a: RecommenderDocument, b: RecommenderDocument) => b.score - a.score);
        }
        return doclist;
    }

    /**
     * Computes the component-wise mean of a set of vectors.
     *
     * @param paragraph the vectors to average
     * @param dimension number of components to average (defaults to 512)
     * @returns the mean vector, or a zero vector when the set is empty
     */
    public mean(paragraph: Set<number[]>, dimension: number = 512) {
        const num_words = paragraph.size;
        let meanVector = new Array<number>(dimension).fill(0); // mean vector
        if (num_words > 0) { // check to see if paragraph actually was vectorized
            paragraph.forEach((wordvec: number[]) => {
                for (let i = 0; i < dimension; i++) {
                    meanVector[i] += wordvec[i];
                }
            });
            meanVector = meanVector.map(x => x / num_words);
        }
        return meanVector;
    }

    /**
     * Wraps a non-empty vector in a RecommenderDocument (score 0) and adds it
     * to the document set. Empty vectors are ignored.
     */
    public processVector(vector: number[], dataDoc: Doc, isMainDoc: boolean) {
        if (vector.length > 0) {
            const internalDoc: RecommenderDocument = { actualDoc: dataDoc, vectorDoc: vector, score: 0 };
            ClientRecommender.Instance.addToDocSet(internalDoc, isMainDoc);
        }
    }

    /**
     * Adds to the document set and, when isMainDoc is true, records the
     * document as the main one (the one clicked).
     */
    private addToDocSet(internalDoc: RecommenderDocument, isMainDoc: boolean) {
        const instance = ClientRecommender.Instance;
        if (instance.docVectors) {
            if (isMainDoc) instance.mainDoc = internalDoc;
            instance.docVectors.add(internalDoc);
        }
    }

    /**
     * Generates tags for an image using Cognitive Services.
     *
     * The converter stores the raw tag names on `extDoc.generatedTags` and
     * returns a tag document with one computed field per tag.
     *
     * @returns whatever ProcessImage returns, or undefined when the document
     *          has no image URL
     */
    generateMetadata = async (dataDoc: Doc, extDoc: Doc, threshold: Confidence = Confidence.Excellent) => {
        const converter = (results: any) => {
            const tagDoc = new Doc();
            const tagsList = new List();
            results.tags.forEach((tag: Tag) => {
                tagsList.push(tag.name);
                // global replace: multi-word tags can contain more than one space
                const sanitized = tag.name.replace(/ /g, "_");
                tagDoc[sanitized] = ComputedField.MakeFunction(`(${tag.confidence} >= this.confidence) ? ${tag.confidence} : "${ComputedField.undefined}"`);
            });
            extDoc.generatedTags = tagsList;
            tagDoc.title = "Generated Tags Doc";
            tagDoc.confidence = threshold;
            return tagDoc;
        };
        const url = this.url(dataDoc);
        if (url) {
            return CognitiveServices.Image.Appliers.ProcessImage(extDoc, ["generatedTagsDoc"], url, Service.ComputerVision, converter);
        }
    }

    /**
     * Gets the URL of the image stored under `fieldkey` on the document's
     * proto, or undefined when that field is not an ImageField.
     */
    private url(dataDoc: Doc) {
        const data = Cast(Doc.GetProto(dataDoc)[fieldkey], ImageField);
        return data ? data.url.href : undefined;
    }

    /**
     * Uses Cognitive Services to extract keywords from a document.
     *
     * @param dataDoc   document whose content is analysed
     * @param extDoc    document that receives generated tags
     * @param internal  when false, keywords are additionally requested from
     *                  "/IBMAnalysis" and an external search is issued
     * @param api       external search backend: "bing" or "arxiv"
     * @param isMainDoc forwarded to the text analyzer
     * @param image     treat the document as an image (use generated tags)
     * @returns the analyzer result, or undefined when there is no text to analyse
     */
    public async extractText(dataDoc: Doc, extDoc: Doc, internal: boolean = true, api: string = "bing", isMainDoc: boolean = false, image: boolean = false) {
        // STEP 1. Consolidate data of document. Depends on type of document.
        let data: string = "";
        let taglist: FieldResult<List<string>> = undefined;
        if (image) {
            if (!extDoc.generatedTags) await this.generateMetadata(dataDoc, extDoc); // TODO: Automatically generate tags. Need to ask Sam about this.
            if (extDoc.generatedTags) {
                taglist = Cast(extDoc.generatedTags, listSpec("string"));
                // Cast may yield nothing if the field is not a string list
                taglist?.forEach(tag => {
                    data += tag + ", ";
                });
            }
        }
        else {
            const fielddata = Cast(dataDoc.data, RichTextField, null);
            data = fielddata?.Text || "";
        }

        // STEP 2. Upon receiving response from Text Cognitive Services, do additional processing on keywords.
        // Currently we are still using Cognitive Services for internal recommendations, but in the future this might not be necessary.

        const converter = async (results: any, data: string, isImage: boolean = false) => {
            let keyterms = new List<string>(); // raw keywords
            let kp_string: string = ""; // keywords*frequency concatenated into a string. input into TF
            let highKP: string[] = [""]; // most frequent keyphrase
            let high = 0;

            if (isImage) { // no keyphrase processing necessary
                kp_string = data;
                if (taglist) {
                    keyterms = taglist;
                    highKP = [taglist[0]];
                }
            }
            else { // text processing
                results.documents.forEach((doc: any) => {
                    const keyPhrases = doc.keyPhrases; // returned by Cognitive Services
                    keyPhrases.forEach((kp: string) => {
                        keyterms.push(kp);
                        const frequency = this.countFrequencies(kp, data); // frequency of keyphrase in paragraph
                        kp_string += kp + ", "; // ensures that if frequency is 0 for some reason kp is still added
                        for (let i = 0; i < frequency - 1; i++) {
                            kp_string += kp + ", "; // weights repeated keywords higher
                        }
                        // replaces highKP with new one
                        if (frequency > high) {
                            high = frequency;
                            highKP = [kp];
                        }
                        // appends to current highKP phrase
                        else if (frequency === high) {
                            highKP.push(kp);
                        }
                    });
                });
            }
            if (kp_string.length > 2) kp_string = kp_string.substring(0, kp_string.length - 2); // strips extra comma and space if there are a lot of keywords
            console.log("kp_string: ", kp_string);

            let ext_recs = "";
            // Pushing keyword extraction to IBM for external recommendations. Should shift to internal eventually.
            if (!internal) {
                const parameters: any = {
                    'language': 'en',
                    'text': data,
                    'features': {
                        'keywords': {
                            'sentiment': true,
                            'emotion': true,
                            'limit': 3
                        }
                    }
                };
                const response = await Networking.PostToServer("/IBMAnalysis", parameters);
                const sorted_keywords = response.result.keywords;
                if (sorted_keywords.length > 0) {
                    console.log("IBM keyphrase", sorted_keywords[0]);
                    // keep at most the first five keywords that are present
                    highKP = sorted_keywords
                        .slice(0, 5)
                        .filter((keyword: any) => keyword)
                        .map((keyword: any) => keyword.text);
                    keyterms = new List<string>(highKP);
                }
                //let kpqv = new KeyphraseQueryView({ keyphrases: ["hello"] });
                ext_recs = await this.sendRequest([highKP[0]], api);
            }

            // keyterms: list for extDoc, kp_string: input to TF, ext_recs: {titles, urls} of retrieved results from highKP query
            return { keyterms: keyterms, external_recommendations: ext_recs, kp_string: [kp_string] };
        };

        // STEP 3: Start recommendation pipeline. Branches off into internal and external in Cognitive Services
        if (data !== "") {
            return CognitiveServices.Text.Appliers.analyzer(dataDoc, extDoc, ["key words"], data, converter, isMainDoc, internal);
        }
        return;
    }

    /**
     * Counts how often a keyphrase occurs in a paragraph.
     *
     * A window as wide as the keyphrase (in words) slides across the paragraph,
     * which is split on spaces and newlines. A window counts when it equals the
     * keyphrase ignoring case, or when it contains every word of the keyphrase
     * in any order (this second comparison is case-sensitive).
     */
    private countFrequencies(keyphrase: string, paragraph: string) {
        const words = paragraph.split(/ |\n/); // splits by new lines and spaces
        const kp_array = keyphrase.split(" ");
        const num_keywords = kp_array.length;
        const normalizedPhrase = JSON.stringify(kp_array).toLowerCase(); // loop-invariant
        let frequency = 0;
        for (let i = 0; i <= words.length - num_keywords; i++) {
            const slice = words.slice(i, i + num_keywords);
            const exactMatch = JSON.stringify(slice).toLowerCase() === normalizedPhrase;
            if (exactMatch || kp_array.every(val => slice.includes(val))) {
                frequency++;
            }
        }
        return frequency;
    }

    /**
     * Sends the keywords to the selected external search API.
     *
     * @param keywords keyphrases, concatenated into one space-separated query
     * @param api "arxiv" or "bing"
     * @returns the search result; undefined (after logging) for an unknown api.
     *          Failures of the underlying request reject the returned promise.
     */
    private async sendRequest(keywords: string[], api: string): Promise<any> {
        // each keyword is prefixed with a space, so the query starts with one
        const query = keywords.map((kp: string) => " " + kp).join("");
        if (api === "arxiv") {
            return this.arxivrequest(query);
        }
        if (api === "bing") {
            return this.bingWebSearch(query);
        }
        console.log("no api specified :(");
        return undefined;
    }

    /**
     * Request to Bing API. Most of code is in Cognitive Services.
     * The converter reduces the response to parallel title and url lists.
     */
    bingWebSearch = async (query: string) => {
        const converter = async (results: any) => {
            const title_vals: string[] = [];
            const url_vals: string[] = [];
            results.webPages.value.forEach((doc: any) => {
                title_vals.push(doc.name);
                url_vals.push(doc.url);
            });
            return { title_vals, url_vals };
        };
        return CognitiveServices.BingSearch.Appliers.analyzer(query, converter);
    }

    /**
     * Actual request to the arXiv server for ML articles.
     *
     * @returns a promise resolving to `{ title_vals, url_vals }` on HTTP 200
     *          and rejecting with the raw response for any other status
     */
    arxivrequest = async (query: string) => {
        const xhttp = new XMLHttpRequest();
        const serveraddress = "http://export.arxiv.org/api";
        const maxresults = 5;
        // encode the query so characters such as '&' or '#' cannot alter the URL
        const endpoint = serveraddress + "/query?search_query=all:" + encodeURIComponent(query) + "&start=0&max_results=" + maxresults.toString();

        // Reads the first text node of elements 1..maxresults. Element 0 is
        // skipped — presumably it belongs to the feed itself rather than to a
        // result entry; confirm against the arXiv response format.
        const collect = (elements: HTMLCollectionOf<Element>) => {
            const values: string[] = [];
            for (let i = 1; i <= maxresults && i < elements.length; i++) {
                const node = elements[i].childNodes[0];
                // keep a placeholder so titles and urls stay index-aligned
                values.push(node?.nodeValue ?? "");
            }
            return values;
        };

        const promisified = (resolve: any, reject: any) => {
            xhttp.onreadystatechange = () => {
                if (xhttp.readyState !== 4) return;
                const result = xhttp.response;
                const xml = xhttp.responseXML;
                console.log("arXiv Result: ", xml);
                if (xhttp.status !== 200) {
                    return reject(result);
                }
                const title_vals = xml ? collect(xml.getElementsByTagName("title")) : [];
                const url_vals = xml ? collect(xml.getElementsByTagName("id")) : [];
                return resolve({ title_vals, url_vals });
            };
            xhttp.open("GET", endpoint, true);
            xhttp.send();
        };
        return new Promise<any>(promisified);
    }

    render() {
        return (<div className="wrapper">
        </div>);
    }

}
\ No newline at end of file |