aboutsummaryrefslogtreecommitdiff
path: root/solr-8.3.1/example/example-DIH/solr/atom
diff options
context:
space:
mode:
Diffstat (limited to 'solr-8.3.1/example/example-DIH/solr/atom')
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml35
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt54
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema106
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt17
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml64
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt29
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt1
-rw-r--r--solr-8.3.1/example/example-DIH/solr/atom/core.properties1
8 files changed, 307 insertions, 0 deletions
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
new file mode 100644
index 000000000..b7de812d0
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml
@@ -0,0 +1,35 @@
+<dataConfig>
+ <dataSource type="URLDataSource"/>
+ <document>
+
+ <entity name="stackoverflow"
+ url="https://stackoverflow.com/feeds/tag/solr"
+ processor="XPathEntityProcessor"
+ forEach="/feed|/feed/entry"
+ transformer="HTMLStripTransformer,RegexTransformer">
+
+ <!-- Feed-level value: picked up once from /feed/updated and applied to all documents (commonField) -->
+ <field column="lastchecked_dt" xpath="/feed/updated" commonField="true"/>
+
+ <!-- Keep only the final path segment of the entry URL: the regex removes everything up to the last "/" -->
+ <field column="id" xpath="/feed/entry/id" regex=".*/" replaceWith=""/>
+
+ <field column="title" xpath="/feed/entry/title"/>
+ <field column="author" xpath="/feed/entry/author/name"/>
+ <field column="category" xpath="/feed/entry/category/@term"/>
+ <field column="link" xpath="/feed/entry/link[@rel='alternate']/@href"/>
+
+ <!-- Use transformers to convert HTML into plain text and collapse runs of spaces and newlines.
+ There is also an UpdateRequestProcessor (trim_text in solrconfig.xml) to trim remaining spaces.
+ -->
+ <field column="summary" xpath="/feed/entry/summary" stripHTML="true" regex="( |\n)+" replaceWith=" "/>
+
+ <!-- Namespaces are ignored when matching XPath, so the element is addressed by its local name only -->
+ <field column="rank" xpath="/feed/entry/rank"/>
+
+ <field column="published_dt" xpath="/feed/entry/published"/>
+ <field column="updated_dt" xpath="/feed/entry/updated"/>
+ </entity>
+
+ </document>
+</dataConfig>
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt
new file mode 100644
index 000000000..2c164c0b2
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A couple of test stopwords to verify that the words are really being
+# loaded from this file:
+stopworda
+stopwordb
+
+# Standard English stop words taken from Lucene's StopAnalyzer
+a
+an
+and
+are
+as
+at
+be
+but
+by
+for
+if
+in
+into
+is
+it
+no
+not
+of
+on
+or
+such
+that
+the
+their
+then
+there
+these
+they
+this
+to
+was
+will
+with
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema b/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema
new file mode 100644
index 000000000..58751520d
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<schema name="example-DIH-atom" version="1.6">
+ <uniqueKey>id</uniqueKey>
+
+ <field name="id" type="string" indexed="true" stored="true" required="true"/>
+ <field name="title" type="text_en_splitting" indexed="true" stored="true"/>
+ <field name="author" type="string" indexed="true" stored="true"/>
+ <field name="category" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="link" type="string" indexed="true" stored="true"/>
+ <field name="summary" type="text_en_splitting" indexed="true" stored="true"/>
+ <field name="rank" type="pint" indexed="true" stored="true"/>
+
+ <dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/>
+
+ <!-- Catch-all field (indexed, not stored) aggregating all "useful to search as text" fields via the copyField instructions below -->
+ <field name="text" type="text_en_splitting" indexed="true" stored="false" multiValued="true"/>
+
+ <field name="urls" type="url_only" indexed="true" stored="false"/>
+
+
+ <copyField source="id" dest="text"/>
+ <copyField source="title" dest="text"/>
+ <copyField source="author" dest="text"/>
+ <copyField source="category" dest="text"/>
+ <copyField source="summary" dest="text"/>
+
+ <!-- Extract URLs from summary for faceting: the url_only type of "urls" keeps only URL tokens at index time -->
+ <copyField source="summary" dest="urls"/>
+
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true"/>
+ <fieldType name="pint" class="solr.IntPointField" docValues="true"/>
+ <fieldType name="pdate" class="solr.DatePointField" docValues="true"/>
+
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and auto-generated phrase queries enabled.
+ It is like the text_en type (not defined in this schema), except
+ that it adds WordDelimiterGraphFilter to enable splitting and
+ matching of words on case changes, alphanumeric boundaries, and
+ non-alphanumeric characters. This means certain compound-word
+ cases will work: for example, the query "wi fi" will match a
+ document containing "WiFi" or "wi-fi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField"
+ positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- In this example, synonyms are applied only at query time, so the index-time filter stays disabled:
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case-insensitive stop word removal. -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1"
+ catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="lang/stopwords_en.txt"
+ />
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1"
+ catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Field type that extracts URLs from the text: at index time only token types listed in url_types.txt are kept.
+ As the stored representation is not changed by analysis, this type is only useful for faceting.
+ It is not terribly useful for searching URLs either, as URLs contain too many special symbols.
+ -->
+ <fieldType name="url_only" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.UAX29URLEmailTokenizerFactory" maxTokenLength="255"/>
+ <filter class="solr.TypeTokenFilterFactory" types="url_types.txt" useWhitelist="true"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+</schema>
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt
new file mode 100644
index 000000000..1303e42a0
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt
@@ -0,0 +1,17 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Use a protected word file to protect against the stemmer reducing two
+# unrelated words to the same base word.
+
+lucene
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml b/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml
new file mode 100644
index 000000000..3694c1531
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ This is a DEMO configuration, highlighting elements
+ specifically needed to get this example running
+ such as libraries and request handler specifics.
+
+ It relies on defaults and leaves most production-level settings undefined,
+ such as the various caches and auto-commit policies.
+
+ See Solr Reference Guide and other examples for
+ more details on a well configured solrconfig.xml
+ https://lucene.apache.org/solr/guide/the-well-configured-solr-instance.html
+-->
+<config>
+
+ <!-- Controls what version of Lucene various components of Solr
+ adhere to. Generally, you want to use the latest version to
+ get all bug fixes and improvements. It is highly recommended
+ that you fully re-index after changing this setting, as it can
+ affect how text is both indexed and queried.
+ -->
+ <luceneMatchVersion>8.3.1</luceneMatchVersion>
+
+ <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/>
+
+ <requestHandler name="/select" class="solr.SearchHandler">
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+ <str name="df">text</str>
+ <!-- Uncomment to change the response format from JSON to XML (XML was the default prior to Solr 7.0)
+ <str name="wt">xml</str>
+ -->
+ </lst>
+ </requestHandler>
+
+ <requestHandler name="/dataimport" class="solr.DataImportHandler">
+ <lst name="defaults">
+ <str name="config">atom-data-config.xml</str>
+ <str name="processor">trim_text</str>
+ </lst>
+ </requestHandler>
+
+ <updateProcessor class="solr.processor.TrimFieldUpdateProcessorFactory" name="trim_text">
+ <str name="typeName">text_en_splitting</str>
+ </updateProcessor>
+
+</config>
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt
new file mode 100644
index 000000000..eab4ee875
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt
@@ -0,0 +1,29 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Some test synonym mappings unlikely to appear in real input text
+aaafoo => aaabar
+bbbfoo => bbbfoo bbbbar
+cccfoo => cccbar cccbaz
+fooaaa,baraaa,bazaaa
+
+# Some synonym groups specific to this example
+GB,gib,gigabyte,gigabytes
+MB,mib,megabyte,megabytes
+Television, Televisions, TV, TVs
+# Note: we use "gib" instead of "GiB" so that any WordDelimiterGraphFilter
+# coming after this filter won't split it into two words.
+
+# Synonym mappings can be used for spelling correction too
+pixima => pixma
+
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt
new file mode 100644
index 000000000..808f31384
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt
@@ -0,0 +1 @@
+<URL>
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/core.properties b/solr-8.3.1/example/example-DIH/solr/atom/core.properties
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/solr-8.3.1/example/example-DIH/solr/atom/core.properties
@@ -0,0 +1 @@
+