aboutsummaryrefslogtreecommitdiff
path: root/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
diff options
context:
space:
mode:
authorkimdahey <claire_kim1@brown.edu>2020-01-16 11:31:41 -0500
committerkimdahey <claire_kim1@brown.edu>2020-01-16 11:31:41 -0500
commit6be0e19ed0bd13f3796f542affa5a2e52674650c (patch)
tree1be222ea9341ecd8020fad3149035fa650a8a07f /solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
parent5cde81d8c6b4dcd8d0796f8669b668763957f395 (diff)
parente410cde0e430553002d4e1a2f64364b57b65fdbc (diff)
merged w master
Diffstat (limited to 'solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml')
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml35
1 files changed, 35 insertions, 0 deletions
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
new file mode 100644
index 000000000..b7de812d0
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
@@ -0,0 +1,35 @@
+<dataConfig>
+ <dataSource type="URLDataSource"/>
+ <document>
+ <!-- Imports the Stack Overflow "solr" tag feed, fetched from the url below through the URLDataSource -->
+ <entity name="stackoverflow"
+ url="https://stackoverflow.com/feeds/tag/solr"
+ processor="XPathEntityProcessor"
+ forEach="/feed|/feed/entry"
+ transformer="HTMLStripTransformer,RegexTransformer">
+
+ <!-- Feed-level value: picked up once from /feed/updated and, because commonField="true", applied to all documents -->
+ <field column="lastchecked_dt" xpath="/feed/updated" commonField="true"/>
+
+ <!-- The entry id is a URL: the regex strips everything up to and including the last "/", keeping only the final (numeric) part -->
+ <field column="id" xpath="/feed/entry/id" regex=".*/" replaceWith=""/>
+
+ <field column="title" xpath="/feed/entry/title"/>
+ <field column="author" xpath="/feed/entry/author/name"/>
+ <field column="category" xpath="/feed/entry/category/@term"/>
+ <field column="link" xpath="/feed/entry/link[@rel='alternate']/@href"/>
+
+ <!-- Use transformers to convert HTML into plain text: stripHTML removes the markup and the
+ regex replaces each run of spaces/newlines with a single space.
+ There is also an UpdateRequestProcessor (not defined in this file) to trim remaining spaces. -->
+ <field column="summary" xpath="/feed/entry/summary" stripHTML="true" regex="( |\n)+" replaceWith=" "/>
+
+ <!-- Namespaces are ignored when matching XPath, so the rank element is addressed here without any prefix -->
+ <field column="rank" xpath="/feed/entry/rank"/>
+
+ <field column="published_dt" xpath="/feed/entry/published"/>
+ <field column="updated_dt" xpath="/feed/entry/updated"/>
+ </entity>
+
+ </document>
+</dataConfig>