blob: b7de812d00502a6e2fbf446a9f2294ccfa0613ad (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
<dataConfig>
  <dataSource type="URLDataSource"/>
  <document>
    <!--
      Imports the Stack Overflow feed for the "solr" tag.
      forEach lists two record paths: the feed element itself and each
      entry beneath it. Field values are run through the HTML-strip and
      regex transformers named in the transformer attribute.
    -->
    <entity name="stackoverflow"
            processor="XPathEntityProcessor"
            url="https://stackoverflow.com/feeds/tag/solr"
            forEach="/feed|/feed/entry"
            transformer="HTMLStripTransformer,RegexTransformer">

      <!-- Feed-level timestamp; commonField copies it onto every document. -->
      <field column="lastchecked_dt"
             xpath="/feed/updated"
             commonField="true"/>

      <!-- The entry id is a URL: drop everything up to and including the
           last "/" so that only the trailing segment is kept. -->
      <field column="id"
             xpath="/feed/entry/id"
             regex=".*/"
             replaceWith=""/>

      <!-- Values copied straight from the entry. -->
      <field column="title" xpath="/feed/entry/title"/>
      <field column="author" xpath="/feed/entry/author/name"/>
      <field column="category" xpath="/feed/entry/category/@term"/>
      <field column="link" xpath="/feed/entry/link[@rel='alternate']/@href"/>

      <!-- Convert the HTML summary into plain text: strip the markup, then
           collapse each run of spaces/newlines into a single space.
           An UpdateRequestProcessor (configured outside this file) trims
           whatever whitespace is left. -->
      <field column="summary"
             xpath="/feed/entry/summary"
             stripHTML="true"
             regex="( |\n)+"
             replaceWith=" "/>

      <!-- Namespaces are ignored when matching XPath, so the element is
           addressed by its bare name. -->
      <field column="rank" xpath="/feed/entry/rank"/>

      <!-- Per-entry timestamps. -->
      <field column="published_dt" xpath="/feed/entry/published"/>
      <field column="updated_dt" xpath="/feed/entry/updated"/>
    </entity>
  </document>
</dataConfig>
|