Code Snippet: MultiTokenStream for Lucene.Net

Written by Troy Howard

16 December 2010

Someone asked on the Lucene.Net mailing list about implementing something similar to Solr's copy fields.

Here's a basic example of how to do something like that in a Lucene.Net index.

  1using System;
  2using System.Collections.Generic;
  3using System.IO;
  4using System.Linq;
  5 
  6using Lucene.Net.Analysis;
  7using Lucene.Net.Analysis.Standard;
  8using Lucene.Net.Documents;
  9using Lucene.Net.Index;
 10using Lucene.Net.QueryParsers;
 11using Lucene.Net.Search;
 12using Lucene.Net.Store;
 13 
 14 
 15namespace MultiTokenStreamExample
 16{
 17    class Program
 18    {
 19        static void Main(string[] args)
 20        {
 21            const string text_content = "This is some content. 123 is a number. 456 is also a number.";
 22            const string numeric_content = "789 this is normal text";
 23            
 24            var standard = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
 25            var numeric = new NumericAnalyzer();
 26 
 27            var perField = new PerFieldAnalyzerWrapper(standard);
 28            perField.AddAnalyzer("numeric_content", numeric);
 29            
 30 
 31            // typical index creation 
 32 
 33            var directory = new RAMDirectory();
 34            var writer = new IndexWriter(directory, standard, IndexWriter.MaxFieldLength.UNLIMITED);
 35            
 36            var document = new Document();
 37 
 38            var text_content_field = new Field("text_content", text_content, Field.Store.YES, Field.Index.ANALYZED);
 39            var id_field = new Field("numeric_content", numeric_content, Field.Store.YES, Field.Index.ANALYZED);
 40            
 41            document.Add(text_content_field);
 42            document.Add(id_field);
 43            
 44 
 45            // special sauce
 46 
 47            var combined_field = 
 48                new MultiField("combined", 
 49                    new List<Fieldable> { text_content_field, id_field},
 50                    perField);
 51 
 52 
 53            document.Add(combined_field);
 54 
 55            writer.AddDocument(document);
 56            writer.Optimize();
 57            writer.Close();
 58 
 59 
 60            // test searching against our combined field...
 61 
 62            Searcher searcher = new IndexSearcher(directory, true);
 63 
 64            // hits on first field
 65            SearchCombined(searcher, "number");
 66 
 67            // hits on first field
 68            SearchCombined(searcher, "123");
 69 
 70            // hits on second field
 71            SearchCombined(searcher, "789");
 72 
 73            // does not hit on second field because numeric analyzer skips that content.
 74            SearchCombined(searcher, "normal");
 75 
 76            Console.ReadKey();
 77        }
 78 
 79        private static void SearchCombined(Searcher searcher, string queryString)
 80        {
 81            // Build a Query object
 82            QueryParser parser = new QueryParser("combined", new StandardAnalyzer());
 83            Query query = parser.Parse(queryString);
 84            
 85            // Search for the query
 86            Hits hits = searcher.Search(query);
 87 
 88          // Examine the Hits object to see if there were any matches
 89          int hitCount = hits.Length();
 90          if (hitCount == 0) {
 91              Console.WriteLine("No matches were found for \"" + queryString + "\"");
 92          }
 93          else {
 94              Console.WriteLine("Hits for \"" + queryString + "\" were found in quotes by:");
 95   
 96              // Iterate over the Documents in the Hits object
 97              for (int i = 0; i < hitCount; i++) {
 98                  Document doc = hits.Doc(i);
 99   
100                  // Print the value that we stored in the "title" field. Note
101                  // that this Field was not indexed, but (unlike the
102                  // "contents" field) was stored verbatim and can be
103                  // retrieved.
104                  Console.WriteLine("  " + (i + 1) + "] [id: " +doc.Get("id") + "] [text_content: " + doc.Get("text_content") + "]");
105              }
106          }
107          Console.WriteLine();
108 
109        }
110    }
111    
112    public class MultiField : Fieldable
113    {
114        public MultiField(string name, List<Fieldable> fields, PerFieldAnalyzerWrapper analyzerWrapper)
115        {
116            _name = name;
117            _fields = fields;
118            _analyzerWrapper = analyzerWrapper;
119        }
120 
121        private string _name;
122        private List<Fieldable> _fields;
123        private PerFieldAnalyzerWrapper _analyzerWrapper;
124 
125        #region Fieldable Members
126 
127        public void SetBoost(float boost)
128        {
129            //throw new Exception("The method or operation is not implemented.");
130        }
131 
132        public float GetBoost()
133        {
134            return 1;
135        }
136 
137        public string Name()
138        {
139            return _name;
140        }
141 
142        public string StringValue()
143        {
144            throw new Exception("The method or operation is not implemented.");
145            //StringBuilder sb = new StringBuilder();
146            
147            //foreach (var f in _fields)
148            //    sb.AppendLine(f.StringValue());
149 
150            //return sb.ToString();
151        }
152 
153        public System.IO.TextReader ReaderValue()
154        {
155            throw new Exception("The method or operation is not implemented.");
156            //var combined = default(TextReader);
157            
158            //foreach (var f in _fields)
159            //    combined = 
160            //        combined == default(TextReader) 
161            //        ? f.ReaderValue() 
162            //        : combined.Union(f.ReaderValue());
163 
164            //return combined;
165        }
166 
167        public byte[] BinaryValue()
168        {
169            throw new Exception("The method or operation is not implemented.");
170            //var ms = new MemoryStream();
171            //foreach (var f in _fields)
172            //{
173            //    var bytes = f.BinaryValue();
174            //    ms.Write(bytes, 0, 0);
175            //}
176 
177            //return ms.ToArray();
178        }
179 
180        public TokenStream TokenStreamValue()
181        {
182            return new MultiTokenStream(_fields.Select(a => GetTokenStream(a)));
183        }
184 
185        private TokenStream GetTokenStream(Fieldable f)
186        {
187            return 
188                f.TokenStreamValue() 
189                ?? _analyzerWrapper.TokenStream(f.Name(), new StringReader(f.StringValue()));
190        }
191 
192        public bool IsStored()
193        {
194            return false;
195        }
196 
197        public bool IsIndexed()
198        {
199            return true;
200        }
201 
202        public bool IsTokenized()
203        {
204            return true;
205        }
206 
207        public bool IsCompressed()
208        {
209            return false;
210        }
211 
212        public bool IsTermVectorStored()
213        {
214            return true;
215        }
216 
217        public bool IsStoreOffsetWithTermVector()
218        {
219            return true;
220        }
221 
222        public bool IsStorePositionWithTermVector()
223        {
224            return true;
225        }
226 
227        public bool IsBinary()
228        {
229            foreach (var f in _fields)
230                if (!f.IsBinary()) return false;
231 
232            return true;
233        }
234 
235        public bool GetOmitNorms()
236        {
237            foreach (var f in _fields)
238                if (!f.GetOmitNorms()) return false;
239 
240            return true;
241        }
242 
243        public void SetOmitNorms(bool omitNorms)
244        {
245            throw new Exception("The method or operation is not implemented.");
246        }
247 
248        public void SetOmitTf(bool omitTf)
249        {
250            throw new Exception("The method or operation is not implemented.");
251        }
252 
253        public bool GetOmitTf()
254        {
255            foreach (var f in _fields)
256                if (!f.GetOmitTf()) return false;
257 
258            return true;
259        }
260 
261        public bool IsLazy()
262        {
263            //foreach (var f in _fields)
264            //    if (!f.GetOmitNorms()) return false;
265 
266            return true;
267        }
268 
269        public int GetBinaryOffset()
270        {
271            return 0;
272        }
273 
274        public int GetBinaryLength()
275        {
276            return BinaryValue().Length;
277        }
278 
279        public byte[] GetBinaryValue()
280        {
281            return BinaryValue();
282        }
283 
284        public byte[] GetBinaryValue(byte[] result)
285        {
286            throw new Exception("The method or operation is not implemented.");
287        }
288 
289        #endregion
290    }
291 
292    // this is just an example to show a different kind of token stream.. 
293    public class NumericAnalyzer : Analyzer
294    {
295        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
296        {
297            return new NumericTokenizer(reader);
298        }
299    }
300    
301    public class NumericTokenizer : CharTokenizer
302    {
303        public NumericTokenizer(TextReader input)
304            : base(input)
305        {
306        }
307 
308        protected override bool IsTokenChar(char c)
309        {
310            // only consider numbers to be tokenizable characters
311            return char.IsNumber(c);
312        }
313    }
314 
315    public class MultiTokenStream : TokenStream
316    {
317        public MultiTokenStream(IEnumerable<TokenStream> tokenStreams)
318        {
319            _tokenStreams = new List<TokenStream>(tokenStreams);            
320        }
321 
322        private List<TokenStream> _tokenStreams;
323        private IEnumerator<TokenStream> _tokenStreamEnumerator;
324        private TokenStream _currentTokenStream;
325 
326        public override void ClearAttributes()
327        {
328            base.ClearAttributes();
329            foreach (TokenStream tokenStream in _tokenStreams)
330                tokenStream.ClearAttributes();
331        }
332 
333        public override bool IncrementToken()
334        {
335            if (_tokenStreamEnumerator == null)
336                _tokenStreamEnumerator = _tokenStreams.GetEnumerator();
337 
338            if (_currentTokenStream == null)
339            {
340                if (!_tokenStreamEnumerator.MoveNext())
341                    return false;
342 
343                _currentTokenStream = _tokenStreamEnumerator.Current;
344            }
345 
346            bool success = _currentTokenStream.IncrementToken();
347 
348            base.RestoreState(_currentTokenStream.CaptureState());
349 
350            if (!success)
351            {
352                if (!_tokenStreamEnumerator.MoveNext()) return false;
353                _currentTokenStream = _tokenStreamEnumerator.Current;
354                return true;
355            }
356 
357            return true;
358        }
359 
360        public override void Reset()
361        {
362            if(null != _tokenStreamEnumerator) 
363                _tokenStreamEnumerator.Reset();
364            _currentTokenStream = null;
365        }
366    }
367 
368    //// FROM: http://stackoverflow.com/questions/2925652/how-to-string-multiple-textreaders-together
369    //public static class Extensions
370    //{
371    //    public static TextReader Union(this TextReader first, TextReader second)
372    //    {
373    //        return new ChainedTextReader(first, second);
374    //    }
375 
376    //    private class ChainedTextReader : TextReader
377    //    {
378    //        private TextReader first;
379    //        private TextReader second;
380    //        private bool readFirst = true;
381 
382    //        public ChainedTextReader(TextReader first, TextReader second)
383    //        {
384    //            this.first = first;
385    //            this.second = second;
386    //        }
387 
388    //        public override int Peek()
389    //        {
390    //            if (readFirst)
391    //            {
392    //                return first.Peek();
393    //            }
394    //            else
395    //            {
396    //                return second.Peek();
397    //            }
398    //        }
399 
400    //        public override int Read()
401    //        {
402    //            if (readFirst)
403    //            {
404    //                int value = first.Read();
405    //                if (value == -1)
406    //                {
407    //                    readFirst = false;
408    //                }
409    //                else
410    //                {
411    //                    return value;
412    //                }
413    //            }
414    //            return second.Read();
415    //        }
416 
417    //        public override void Close()
418    //        {
419    //            first.Close();
420    //            second.Close();
421    //        }
422 
423    //        protected override void Dispose(bool disposing)
424    //        {
425    //            base.Dispose(disposing);
426    //            if (disposing)
427    //            {
428    //                first.Dispose();
429    //                second.Dispose();
430    //            }
431    //        }
432    //    }
433    //}
434}

This is available as a Gist here: https://gist.github.com/thoward/744444