diff options
Diffstat (limited to 'solr-8.3.1/example/example-DIH/solr/atom/conf')
7 files changed, 306 insertions, 0 deletions
diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml new file mode 100644 index 000000000..b7de812d0 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/atom-data-config.xml @@ -0,0 +1,35 @@ +<dataConfig> + <dataSource type="URLDataSource"/> + <document> + + <entity name="stackoverflow" + url="https://stackoverflow.com/feeds/tag/solr" + processor="XPathEntityProcessor" + forEach="/feed|/feed/entry" + transformer="HTMLStripTransformer,RegexTransformer"> + + <!-- Pick this value up from the feed level and apply to all documents --> + <field column="lastchecked_dt" xpath="/feed/updated" commonField="true"/> + + <!-- Keep only the final numeric part of the URL --> + <field column="id" xpath="/feed/entry/id" regex=".*/" replaceWith=""/> + + <field column="title" xpath="/feed/entry/title"/> + <field column="author" xpath="/feed/entry/author/name"/> + <field column="category" xpath="/feed/entry/category/@term"/> + <field column="link" xpath="/feed/entry/link[@rel='alternate']/@href"/> + + <!-- Use transformers to convert HTML into plain text. + There is also an UpdateRequestProcessor to trim remaining spaces. 
+ --> + <field column="summary" xpath="/feed/entry/summary" stripHTML="true" regex="( |\n)+" replaceWith=" "/> + + <!-- Ignore namespaces when matching XPath --> + <field column="rank" xpath="/feed/entry/rank"/> + + <field column="published_dt" xpath="/feed/entry/published"/> + <field column="updated_dt" xpath="/feed/entry/updated"/> + </entity> + + </document> +</dataConfig> diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt new file mode 100644 index 000000000..2c164c0b2 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard English stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema b/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema new file mode 100644 index 000000000..58751520d --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/managed-schema @@ -0,0 +1,106 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<schema name="example-DIH-atom" version="1.6"> + <uniqueKey>id</uniqueKey> + + <field name="id" type="string" indexed="true" stored="true" required="true"/> + <field name="title" type="text_en_splitting" indexed="true" stored="true"/> + <field name="author" type="string" indexed="true" stored="true"/> + <field name="category" type="string" indexed="true" stored="true" multiValued="true"/> + <field name="link" type="string" indexed="true" stored="true"/> + <field name="summary" type="text_en_splitting" indexed="true" stored="true"/> + <field name="rank" type="pint" indexed="true" stored="true"/> + + <dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/> + + <!-- Catch-all field, aggregating all "useful to search as text" fields via the copyField instructions --> + <field name="text" type="text_en_splitting" indexed="true" stored="false" multiValued="true"/> + + <field name="urls" type="url_only" indexed="true" stored="false"/> + + + <copyField source="id" dest="text"/> + <copyField source="title" dest="text"/> + <copyField source="author" dest="text"/> + <copyField source="category" dest="text"/> + <copyField source="summary" dest="text"/> + + <!-- extract URLs from summary for faceting --> + <copyField source="summary" dest="urls"/> + + <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true"/> + <fieldType name="pint" class="solr.IntPointField" docValues="true"/> + <fieldType name="pdate" class="solr.DatePointField" docValues="true"/> + + + <!-- A text field with defaults appropriate for English, plus + aggressive word-splitting and autophrase features enabled. + This field is just like text_en, except it adds + WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and + non-alphanumeric chars. This means certain compound word + cases will work, for example query "wi fi" will match + document "WiFi" or "wi-fi". 
+ --> + <fieldType name="text_en_splitting" class="solr.TextField" + positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. --> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> + <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" + catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + <filter class="solr.FlattenGraphFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="lang/stopwords_en.txt" + /> + <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" + catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Field type that extracts URLs from the text. + As the stored representation is not changed, it is only useful for faceting. + It is not terribly useful for searching URLs either, as there are too many special symbols. 
+ --> + <fieldType name="url_only" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.UAX29URLEmailTokenizerFactory" maxTokenLength="255"/> + <filter class="solr.TypeTokenFilterFactory" types="url_types.txt" useWhitelist="true"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.KeywordTokenizerFactory"/> + </analyzer> + </fieldType> + +</schema> diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt new file mode 100644 index 000000000..1303e42a0 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/protwords.txt @@ -0,0 +1,17 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +lucene diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml b/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml new file mode 100644 index 000000000..3694c1531 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/solrconfig.xml @@ -0,0 +1,64 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- + This is a DEMO configuration, highlighting elements + specifically needed to get this example running + such as libraries and request handler specifics. + + It uses defaults for, or does not define, most production-level settings + such as various caches or auto-commit policies. + + See the Solr Reference Guide and other examples for + more details on a well-configured solrconfig.xml + https://lucene.apache.org/solr/guide/the-well-configured-solr-instance.html +--> +<config> + + <!-- Controls what version of Lucene various components of Solr + adhere to. Generally, you want to use the latest version to + get all bug fixes and improvements. It is highly recommended + that you fully re-index after changing this setting as it can + affect both how text is indexed and queried. 
+ --> + <luceneMatchVersion>8.3.1</luceneMatchVersion> + + <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/> + + <requestHandler name="/select" class="solr.SearchHandler"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <str name="df">text</str> + <!-- Change from JSON to XML format (the default prior to Solr 7.0) + <str name="wt">xml</str> + --> + </lst> + </requestHandler> + + <requestHandler name="/dataimport" class="solr.DataImportHandler"> + <lst name="defaults"> + <str name="config">atom-data-config.xml</str> + <str name="processor">trim_text</str> + </lst> + </requestHandler> + + <updateProcessor class="solr.processor.TrimFieldUpdateProcessorFactory" name="trim_text"> + <str name="typeName">text_en_splitting</str> + </updateProcessor> + +</config> diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt new file mode 100644 index 000000000..eab4ee875 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt b/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt new file mode 100644 index 000000000..808f31384 --- /dev/null +++ b/solr-8.3.1/example/example-DIH/solr/atom/conf/url_types.txt @@ -0,0 +1 @@ +<URL> |