Index: hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java
===================================================================
--- hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java (revision 19164)
+++ hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java (working copy)
@@ -174,6 +174,12 @@
tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
AnalyzerUtils.assertTokensEqual( tokens, new String[] { "foo", "bar" } );
+ // CharStreamFactories test
+ analyzer = fts.getSearchFactory().getAnalyzer( "mapping_char_analyzer" );
+ text = "CORAÇÃO DE MELÃO";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "CORACAO", "DE", "MELAO" } );
+
fts.close();
}
Index: hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/Team.java
===================================================================
--- hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/Team.java (revision 19164)
+++ hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/Team.java (working copy)
@@ -28,12 +28,12 @@
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
-import org.apache.solr.analysis.HTMLStripStandardTokenizerFactory;
-import org.apache.solr.analysis.HTMLStripWhitespaceTokenizerFactory;
+import org.apache.solr.analysis.HTMLStripCharFilterFactory;
import org.apache.solr.analysis.ISOLatin1AccentFilterFactory;
import org.apache.solr.analysis.LengthFilterFactory;
import org.apache.solr.analysis.LowerCaseFilterFactory;
import org.apache.solr.analysis.LowerCaseTokenizerFactory;
+import org.apache.solr.analysis.MappingCharFilterFactory;
import org.apache.solr.analysis.PorterStemFilterFactory;
import org.apache.solr.analysis.ShingleFilterFactory;
import org.apache.solr.analysis.SnowballPorterFilterFactory;
@@ -49,6 +49,7 @@
import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.AnalyzerDef;
import org.hibernate.search.annotations.AnalyzerDefs;
+import org.hibernate.search.annotations.CharFilterDef;
import org.hibernate.search.annotations.DocumentId;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Indexed;
@@ -87,17 +88,19 @@
filters = {
@TokenFilterDef(factory = StandardFilterFactory.class)
}),
-
@AnalyzerDef(name = "html_standard_analyzer",
- tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
filters = {
@TokenFilterDef(factory = StandardFilterFactory.class)
}),
@AnalyzerDef(name = "html_whitespace_analyzer",
- tokenizer = @TokenizerDef(factory = HTMLStripWhitespaceTokenizerFactory.class),
- filters = {
- @TokenFilterDef(factory = StandardFilterFactory.class)
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
}),
@AnalyzerDef(name = "trim_analyzer",
@@ -123,7 +126,10 @@
}),
@AnalyzerDef(name = "word_analyzer",
- tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
filters = {
@TokenFilterDef(factory = WordDelimiterFilterFactory.class, params = {
@Parameter(name = "splitOnCaseChange", value = "1")
@@ -131,7 +137,10 @@
}),
@AnalyzerDef(name = "synonym_analyzer",
- tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
filters = {
@TokenFilterDef(factory = SynonymFilterFactory.class, params = {
@Parameter(name = "synonyms",
@@ -140,7 +149,10 @@
}),
@AnalyzerDef(name = "shingle_analyzer",
- tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
filters = {
@TokenFilterDef(factory = ShingleFilterFactory.class)
}),
@@ -152,7 +164,23 @@
@Parameter(name = "encoder", value = "Metaphone"),
@Parameter(name = "inject", value = "false")
})
- })
+ }),
+
+ @AnalyzerDef(name = "html_char_analyzer",
+ charFilters = {
+ @CharFilterDef(factory = HTMLStripCharFilterFactory.class)
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class)
+ ),
+
+ @AnalyzerDef(name = "mapping_char_analyzer",
+ charFilters = {
+ @CharFilterDef(factory = MappingCharFilterFactory.class, params = {
+ @Parameter(name = "mapping", value = "org/hibernate/search/test/analyzer/solr/mapping-chars.properties")
+ })
+ },
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class)
+ )
})
public class Team {
@Id
Index: hibernate-search/src/test/resources/org/hibernate/search/test/analyzer/solr/mapping-chars.properties
===================================================================
--- hibernate-search/src/test/resources/org/hibernate/search/test/analyzer/solr/mapping-chars.properties (revision 0)
+++ hibernate-search/src/test/resources/org/hibernate/search/test/analyzer/solr/mapping-chars.properties (revision 0)
@@ -0,0 +1,80 @@
+# À => A
+"\u00C0" => "A"
+
+# Á => A
+"\u00C1" => "A"
+
+# Â => A
+"\u00C2" => "A"
+
+# Ã => A
+"\u00C3" => "A"
+
+# Ä => A
+"\u00C4" => "A"
+
+# Å => A
+"\u00C5" => "A"
+
+# Æ => AE
+"\u00C6" => "AE"
+
+# Ç => C
+"\u00C7" => "C"
+
+# È => E
+"\u00C8" => "E"
+
+# É => E
+"\u00C9" => "E"
+
+# Ê => E
+"\u00CA" => "E"
+
+# Ë => E
+"\u00CB" => "E"
+
+# Ì => I
+"\u00CC" => "I"
+
+# Í => I
+"\u00CD" => "I"
+
+# Î => I
+"\u00CE" => "I"
+
+# Ï => I
+"\u00CF" => "I"
+
+# IJ => IJ
+"\u0132" => "IJ"
+
+# Ð => D
+"\u00D0" => "D"
+
+# Ñ => N
+"\u00D1" => "N"
+
+# Ò => O
+"\u00D2" => "O"
+
+# Ó => O
+"\u00D3" => "O"
+
+# Ô => O
+"\u00D4" => "O"
+
+# Õ => O
+"\u00D5" => "O"
+
+# Ö => O
+"\u00D6" => "O"
+
+# Ø => O
+"\u00D8" => "O"
+
+# Œ => OE
+"\u0152" => "OE"
+
+# Þ => TH
+"\u00DE" => "TH"
\ No newline at end of file
Index: hibernate-search/src/main/docbook/en-US/modules/mapping.xml
===================================================================
--- hibernate-search/src/main/docbook/en-US/modules/mapping.xml (revision 19164)
+++ hibernate-search/src/main/docbook/en-US/modules/mapping.xml (working copy)
@@ -693,6 +693,12 @@
+ a list of char filters: each char filter is responsible for
+ pre-processing input characters before the tokenization. Char filters can add,
+ change or remove characters; one common usage is character normalization
+
+
+
a tokenizer: responsible for tokenizing the input stream
into individual words
@@ -704,15 +710,16 @@
- This separation of tasks - a tokenizer followed by a list of
+ This separation of tasks - a list of char filters, and a tokenizer followed by a list of
filters - allows for easy reuse of each individual component and let
you build your customized analyzer in a very flexible way (just like
- Lego). Generally speaking the Tokenizer starts
- the analysis process by turning the character input into tokens which
+ Lego). Generally speaking the char filters do some
+ pre-processing of the character input, then the Tokenizer starts
+ the tokenizing process by turning the character input into tokens which
are then further processed by the TokenFilters.
Hibernate Search supports this infrastructure by utilizing the Solr
analyzer framework. Make sure to add solr-core.jar and
- solr-common.jar to your classpath to
+ solr-solrj.jar to your classpath to
use analyzer definitions. In case you also want to utilizing a
snowball stemmer also include the
lucene-snowball.jar. Other Solr analyzers might
@@ -727,6 +734,11 @@
framework
@AnalyzerDef(name="customanalyzer",
+ charFilters = {
+ @CharFilterDef(factory = MappingCharFilterFactory.class, params = {
+ @Parameter(name = "mapping", value = "org/hibernate/search/test/analyzer/solr/mapping-chars.properties")
+ })
+ },
tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
filters = {
@TokenFilterDef(factory = ISOLatin1AccentFilterFactory.class),
@@ -741,9 +753,12 @@
}
- A tokenizer is defined by its factory which is responsible for
- building the tokenizer and using the optional list of parameters. This
- example use the standard tokenizer. A filter is defined by its factory
+ A char filter is defined by its factory which is responsible for
+ building the char filter and using the optional list of parameters.
+ In our example, a mapping char filter is used, and will replace
+ characters in the input based on the rules specified in the mapping
+ file. A tokenizer is also defined by its factory.
+ This example uses the standard tokenizer. A filter is defined by its factory
which is responsible for creating the filter instance using the
optional parameters. In our example, the StopFilter filter is built
reading the dedicated words property file and is expected to ignore
@@ -751,7 +766,7 @@
factory.
- Filters are applied in the order they are defined in the
+ Filters and char filters are applied in the order they are defined in the
@AnalyzerDef annotation. Make sure to think
twice about this order.
@@ -800,13 +815,53 @@
Available analyzers
- Solr and Lucene come with a lot of useful default tokenizers and
- filters. You can find a complete list of tokenizer factories and
+ Solr and Lucene come with a lot of useful default char filters, tokenizers and
+ filters. You can find a complete list of char filter factories, tokenizer factories and
filter factories at http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters.
Let check a few of them.
+ Some of the available char filters
+
+
+
+ Factory
+
+ Description
+
+ parameters
+
+
+
+
+ MappingCharFilterFactory
+
+ Replaces one or more characters with one or more characters, based on mappings
+ specified in the resource file
+
+ mapping: points to a resource file containing the mappings
+ using the format:
+
+ "á" => "a"
+ "ñ" => "n"
+ "ø" => "o"
+
+
+
+
+ HTMLStripCharFilterFactory
+
+ Removes standard HTML tags, keeping the text
+
+ none
+
+
+
+
+
+
+
Some of the available tokenizers
@@ -833,7 +888,7 @@
HTMLStripStandardTokenizerFactory
Remove HTML tags, keep the text and pass it to a
- StandardTokenizer
+ StandardTokenizer. @Deprecated, use the HTMLStripCharFilterFactory instead
none
Index: hibernate-search/src/main/java/org/hibernate/search/annotations/CharFilterDef.java
===================================================================
--- hibernate-search/src/main/java/org/hibernate/search/annotations/CharFilterDef.java (revision 0)
+++ hibernate-search/src/main/java/org/hibernate/search/annotations/CharFilterDef.java (revision 0)
@@ -0,0 +1,52 @@
+/**
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2009, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
+package org.hibernate.search.annotations;
+
+import org.apache.solr.analysis.CharFilterFactory;
+
+import java.lang.annotation.Documented;
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Define a <code>CharFilterFactory</code> and its parameters
+ *
+ * @author Gustavo Fernandes
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ ElementType.TYPE, ElementType.FIELD, ElementType.METHOD})
+@Documented
+public @interface CharFilterDef {
+ /**
+	 * @return the <code>CharFilterFactory</code> class which shall be instantiated.
+ */
+	 public abstract Class<? extends CharFilterFactory> factory();
+
+ /**
+	 * @return Optional parameters passed to the <code>CharFilterFactory</code>.
+ */
+ public abstract Parameter[] params() default { };
+}
Index: hibernate-search/src/main/java/org/hibernate/search/annotations/AnalyzerDef.java
===================================================================
--- hibernate-search/src/main/java/org/hibernate/search/annotations/AnalyzerDef.java (revision 19164)
+++ hibernate-search/src/main/java/org/hibernate/search/annotations/AnalyzerDef.java (working copy)
@@ -53,6 +53,11 @@
String name();
/**
+ * @return CharFilters used. The filters are applied in the defined order
+ */
+ CharFilterDef[] charFilters() default { };
+
+ /**
* @return Tokenizer used.
*/
TokenizerDef tokenizer();
Index: hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java
===================================================================
--- hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java (revision 19164)
+++ hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java (working copy)
@@ -29,6 +29,7 @@
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
@@ -36,6 +37,7 @@
import org.apache.solr.common.ResourceLoader;
import org.hibernate.search.annotations.AnalyzerDef;
+import org.hibernate.search.annotations.CharFilterDef;
import org.hibernate.search.annotations.TokenizerDef;
import org.hibernate.search.annotations.TokenFilterDef;
import org.hibernate.search.annotations.Parameter;
@@ -65,7 +67,9 @@
tokenFactory.init( getMapOfParameters( token.params() ) );
final int length = analyzerDef.filters().length;
+ final int charLength = analyzerDef.charFilters().length;
TokenFilterFactory[] filters = new TokenFilterFactory[length];
+ CharFilterFactory[] charFilters = new CharFilterFactory[charLength];
ResourceLoader resourceLoader = new HibernateSearchResourceLoader();
for ( int index = 0 ; index < length ; index++ ) {
TokenFilterDef filterDef = analyzerDef.filters()[index];
@@ -75,7 +79,15 @@
((ResourceLoaderAware)filters[index]).inform( resourceLoader );
}
}
- return new TokenizerChain(tokenFactory, filters);
+ for ( int index = 0 ; index < charFilters.length ; index++ ) {
+ CharFilterDef charFilterDef = analyzerDef.charFilters()[index];
+ charFilters[index] = (CharFilterFactory) instantiate( charFilterDef.factory() );
+ charFilters[index].init( getMapOfParameters( charFilterDef.params() ) );
+ if ( charFilters[index] instanceof ResourceLoaderAware ) {
+ ((ResourceLoaderAware)charFilters[index]).inform( resourceLoader );
+ }
+ }
+ return new TokenizerChain( charFilters, tokenFactory, filters );
}
private static Object instantiate(Class clazz) {