import { v4 as uuidv4 } from 'uuid';
import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';

const websiteInfoScraperToolParams = [
    {
        name: 'urls',
        type: 'string[]',
        description: 'The URLs of the websites to scrape',
        required: true,
        max_inputs: 3,
    },
] as const;
type WebsiteInfoScraperToolParamsType = typeof websiteInfoScraperToolParams;
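
// Note: the `as const` assertion above preserves the literal `name` and `type`
// values, which is what lets ParametersType derive a typed args object from the
// parameter list. A minimal illustration (assuming the generic maps the
// 'string[]' type string to string[]):
//
//   type ExampleArgs = ParametersType<WebsiteInfoScraperToolParamsType>;
//   //   => { urls: string[] }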

const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
    name: 'websiteInfoScraper',
    description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
    citationRules: `
Your task is to provide a comprehensive response to the user's prompt using the content scraped from the relevant websites. Follow these guidelines when structuring your response:

1. Grounded Text Tag Structure:
   - Wrap all text derived from the scraped website(s) in <grounded_text> tags.
   - **Do not include non-sourced information** in <grounded_text> tags.
   - Use a single <grounded_text> tag for content derived from one website. When citing multiple websites, create a separate <grounded_text> tag for each.
   - Ensure each <grounded_text> tag has a citation_index corresponding to the scraped URL.

2. Citation Tag Structure:
   - Create a <citation> tag for each distinct piece of information used from the website(s).
   - Each <citation> tag must reference a URL chunk via its chunk_id attribute.
   - For URL-based citations, leave the citation content empty, but set the chunk_id attribute and use type 'url'.

3. Structural Integrity Checks:
   - Ensure all opening and closing tags are properly matched.
   - Verify that every citation_index attribute in a <grounded_text> tag corresponds to a valid citation.
   - Do not over-cite; cite only the most relevant parts of the websites.

Example Usage:
<answer>
    <grounded_text citation_index="1">
        Based on data from the World Bank, economic growth has stabilized in recent years, following a surge in investments.
    </grounded_text>
    <grounded_text citation_index="2">
        According to information retrieved from the International Monetary Fund, the inflation rate has been gradually decreasing since 2020.
    </grounded_text>
    <citations>
        <citation index="1" chunk_id="1234" type="url"></citation>
        <citation index="2" chunk_id="5678" type="url"></citation>
    </citations>
    <follow_up_questions>
        <question>What are the long-term economic impacts of increased investments on GDP?</question>
        <question>How might inflation trends affect future monetary policy?</question>
        <question>Are there additional factors that could influence economic growth beyond investments and inflation?</question>
    </follow_up_questions>
</answer>

***NOTE***: Ensure that the response is structured correctly and adheres to the guidelines above. Where possible, cite multiple websites to provide a comprehensive response.
`,
    parameterRules: websiteInfoScraperToolParams,
};
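
/**
 * Tool that scrapes the plain-text content of up to three websites and returns
 * it as citation-ready chunks.
 *
 * @example
 * // A minimal usage sketch; the `linkedDocs` map stands in for whatever the
 * // host application does with linked URL documents (an assumption here).
 * const linkedDocs = new Map<string, string>();
 * const tool = new WebsiteInfoScraperTool((url, id) => linkedDocs.set(id, url));
 * const observations = await tool.execute({ urls: ['https://example.com'] });
 */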
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
    private _addLinkedUrlDoc: (url: string, id: string) => void;

    constructor(addLinkedUrlDoc: (url: string, id: string) => void) {
        super(websiteInfoScraperToolInfo);
        this._addLinkedUrlDoc = addLinkedUrlDoc;
    }

    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
        const urls = args.urls;
        // Create an array of promises, each one scraping a single URL.
        const scrapingPromises = urls.map(async url => {
            try {
                const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
                // Register the scraped URL as a linked document so citations can resolve back to it.
                const id = uuidv4();
                this._addLinkedUrlDoc(url, id);
                return {
                    type: 'text',
                    text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                } as Observation;
            } catch (error) {
                console.error(error);
                // Surface the failure as an observation so the model can account for the missing source.
                return {
                    type: 'text',
                    text: `An error occurred while scraping the website: ${url}`,
                } as Observation;
            }
        });
        // Wait for all scrapes to settle; results preserve the input URL order.
        return Promise.all(scrapingPromises);
    }
}
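
// A minimal sketch of how a caller might recover the cited chunk ids from a
// model response that follows the citation rules above. The `response` string
// and the regex-based extraction are assumptions for illustration only:
//
//   const chunkIds = [...response.matchAll(/<citation[^>]+chunk_id="([^"]+)"/g)]
//       .map(match => match[1]);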