Code Snippet: MultiTokenStream for Lucene.Net

Written by Troy Howard

16 December 2010

Someone asked on the Lucene.Net mailing list about implementing something similar to Solr's copy fields.

Here's a basic example of how to do something like that in a Lucene.Net index.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;


namespace MultiTokenStreamExample
{
    /// <summary>
    /// Console demo: indexes a single document into an in-memory index and then queries the
    /// synthetic "combined" field, which carries the tokens of the two regular fields.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the index, runs four sample searches against the "combined" field and
        /// waits for a key press before returning.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string text_content = "This is some content. 123 is a number. 456 is also a number.";
            const string numeric_content = "789 this is normal text";

            var standard = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            var numeric = new NumericAnalyzer();

            // This wrapper is handed only to the combined field below; the writer itself
            // analyzes the two regular fields with the standard analyzer.
            var perField = new PerFieldAnalyzerWrapper(standard);
            perField.AddAnalyzer("numeric_content", numeric);


            // typical index creation 

            var directory = new RAMDirectory();
            var writer = new IndexWriter(directory, standard, IndexWriter.MaxFieldLength.UNLIMITED);

            var document = new Document();

            var text_content_field = new Field("text_content", text_content, Field.Store.YES, Field.Index.ANALYZED);
            var id_field = new Field("numeric_content", numeric_content, Field.Store.YES, Field.Index.ANALYZED);

            document.Add(text_content_field);
            document.Add(id_field);


            // special sauce

            var combined_field = 
                new MultiField("combined", 
                    new List<Fieldable> { text_content_field, id_field},
                    perField);


            document.Add(combined_field);

            writer.AddDocument(document);
            writer.Optimize();
            writer.Close();


            // test searching against our combined field...

            Searcher searcher = new IndexSearcher(directory, true);
            try
            {
                // hits on first field
                SearchCombined(searcher, "number");

                // hits on first field
                SearchCombined(searcher, "123");

                // hits on second field
                SearchCombined(searcher, "789");

                // does not hit on second field because numeric analyzer skips that content.
                SearchCombined(searcher, "normal");
            }
            finally
            {
                // Release the searcher even if one of the searches throws.
                searcher.Close();
            }

            Console.ReadKey();
        }

        /// <summary>
        /// Parses <paramref name="queryString"/> against the "combined" field, runs it and
        /// prints the stored values of every matching document to the console.
        /// </summary>
        /// <param name="searcher">Searcher opened on the index to query.</param>
        /// <param name="queryString">Query text in query-parser syntax.</param>
        private static void SearchCombined(Searcher searcher, string queryString)
        {
            // Build a Query object, analyzing the query text with the same analyzer
            // version that was used to build the index.
            var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            QueryParser parser = new QueryParser("combined", analyzer);
            Query query = parser.Parse(queryString);

            // Search for the query
            Hits hits = searcher.Search(query);

            // Examine the Hits object to see if there were any matches
            int hitCount = hits.Length();
            if (hitCount == 0)
            {
                Console.WriteLine("No matches were found for \"" + queryString + "\"");
            }
            else
            {
                Console.WriteLine("Hits for \"" + queryString + "\" were found in:");

                // Iterate over the Documents in the Hits object
                for (int i = 0; i < hitCount; i++)
                {
                    Document doc = hits.Doc(i);

                    // Print the two stored fields. The "combined" field is indexed but
                    // not stored, so it has no value to retrieve here.
                    Console.WriteLine(
                        "  " + (i + 1)
                        + "] [numeric_content: " + doc.Get("numeric_content")
                        + "] [text_content: " + doc.Get("text_content") + "]");
                }
            }
            Console.WriteLine();
        }
    }

    /// <summary>
    /// An index-only field whose token stream is the concatenation of the token streams of
    /// several other fields. It is indexed and tokenized but never stored, so it exposes no
    /// string, reader or binary value of its own.
    /// </summary>
    public class MultiField : Fieldable
    {
        /// <summary>
        /// Creates a combined field over the given source fields.
        /// </summary>
        /// <param name="name">Name under which the combined tokens are indexed.</param>
        /// <param name="fields">Source fields whose tokens are concatenated, in list order.</param>
        /// <param name="analyzerWrapper">
        /// Analyzer used to tokenize a source field that does not provide its own token
        /// stream; it is looked up by the source field's name.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="name"/> or <paramref name="fields"/> is null.
        /// </exception>
        public MultiField(string name, List<Fieldable> fields, PerFieldAnalyzerWrapper analyzerWrapper)
        {
            if (name == null)
                throw new ArgumentNullException("name");
            if (fields == null)
                throw new ArgumentNullException("fields");

            _name = name;
            _fields = fields;
            _analyzerWrapper = analyzerWrapper;
        }

        private readonly string _name;
        private readonly List<Fieldable> _fields;
        private readonly PerFieldAnalyzerWrapper _analyzerWrapper;
        private float _boost = 1.0f;

        #region Fieldable Members

        /// <summary>Sets the boost reported by <see cref="GetBoost"/>.</summary>
        public void SetBoost(float boost)
        {
            _boost = boost;
        }

        /// <summary>Returns the boost set via <see cref="SetBoost"/>; 1 by default.</summary>
        public float GetBoost()
        {
            return _boost;
        }

        /// <summary>Returns the name this field was constructed with.</summary>
        public string Name()
        {
            return _name;
        }

        /// <summary>Not available: the field only exposes a token stream.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public string StringValue()
        {
            throw new NotImplementedException();
        }

        /// <summary>Not available: the field only exposes a token stream.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public System.IO.TextReader ReaderValue()
        {
            throw new NotImplementedException();
        }

        /// <summary>Not available: the field only exposes a token stream.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public byte[] BinaryValue()
        {
            throw new NotImplementedException();
        }

        /// <summary>
        /// Returns a new stream that yields the tokens of every source field, one field
        /// after another.
        /// </summary>
        public TokenStream TokenStreamValue()
        {
            return new MultiTokenStream(_fields.Select(a => GetTokenStream(a)));
        }

        /// <summary>
        /// Returns the source field's own token stream when it has one; otherwise analyzes
        /// its string value with the analyzer wrapper.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The field has neither a token stream nor a string value, or it needs analysis
        /// and no analyzer wrapper was supplied.
        /// </exception>
        private TokenStream GetTokenStream(Fieldable f)
        {
            TokenStream ownStream = f.TokenStreamValue();
            if (ownStream != null)
                return ownStream;

            string value = f.StringValue();
            if (value == null)
                throw new InvalidOperationException(
                    "Field '" + f.Name() + "' has neither a token stream nor a string value to combine.");

            if (_analyzerWrapper == null)
                throw new InvalidOperationException(
                    "An analyzer is required to tokenize field '" + f.Name() + "'.");

            return _analyzerWrapper.TokenStream(f.Name(), new StringReader(value));
        }

        /// <summary>Always false: the combined field is never stored.</summary>
        public bool IsStored()
        {
            return false;
        }

        /// <summary>Always true.</summary>
        public bool IsIndexed()
        {
            return true;
        }

        /// <summary>Always true.</summary>
        public bool IsTokenized()
        {
            return true;
        }

        /// <summary>Always false.</summary>
        public bool IsCompressed()
        {
            return false;
        }

        /// <summary>Always true.</summary>
        public bool IsTermVectorStored()
        {
            return true;
        }

        /// <summary>Always true.</summary>
        public bool IsStoreOffsetWithTermVector()
        {
            return true;
        }

        /// <summary>Always true.</summary>
        public bool IsStorePositionWithTermVector()
        {
            return true;
        }

        /// <summary>
        /// Always false: <see cref="BinaryValue"/> is not available, so the field must not
        /// advertise itself as binary.
        /// </summary>
        public bool IsBinary()
        {
            return false;
        }

        /// <summary>
        /// True only when every source field omits norms (also true for an empty list).
        /// </summary>
        public bool GetOmitNorms()
        {
            return _fields.All(f => f.GetOmitNorms());
        }

        /// <summary>Not supported; the value is derived from the source fields.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public void SetOmitNorms(bool omitNorms)
        {
            throw new NotImplementedException();
        }

        /// <summary>Not supported; the value is derived from the source fields.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public void SetOmitTf(bool omitTf)
        {
            throw new NotImplementedException();
        }

        /// <summary>
        /// True only when every source field omits term frequencies (also true for an
        /// empty list).
        /// </summary>
        public bool GetOmitTf()
        {
            return _fields.All(f => f.GetOmitTf());
        }

        /// <summary>
        /// Always false: this field is built in memory for indexing and is never loaded
        /// from an index.
        /// </summary>
        public bool IsLazy()
        {
            return false;
        }

        /// <summary>Always 0.</summary>
        public int GetBinaryOffset()
        {
            return 0;
        }

        /// <summary>Delegates to <see cref="BinaryValue"/>, which always throws.</summary>
        public int GetBinaryLength()
        {
            return BinaryValue().Length;
        }

        /// <summary>Delegates to <see cref="BinaryValue"/>, which always throws.</summary>
        public byte[] GetBinaryValue()
        {
            return BinaryValue();
        }

        /// <summary>Not available: the field only exposes a token stream.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        public byte[] GetBinaryValue(byte[] result)
        {
            throw new NotImplementedException();
        }

        #endregion
    }

    // this is just an example to show a different kind of token stream.. 
    /// <summary>
    /// Analyzer that tokenizes every field with a <see cref="NumericTokenizer"/>.
    /// </summary>
    public class NumericAnalyzer : Analyzer
    {
        /// <summary>
        /// Wraps <paramref name="reader"/> in a <see cref="NumericTokenizer"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream numbersOnly = new NumericTokenizer(reader);
            return numbersOnly;
        }
    }

    /// <summary>
    /// Character tokenizer whose tokens consist solely of characters that
    /// <see cref="Char.IsNumber(char)"/> accepts.
    /// </summary>
    public class NumericTokenizer : CharTokenizer
    {
        /// <summary>Creates a tokenizer reading from <paramref name="input"/>.</summary>
        public NumericTokenizer(TextReader input) : base(input) { }

        /// <summary>
        /// Decides whether <paramref name="c"/> belongs to a token.
        /// </summary>
        /// <returns>True when <paramref name="c"/> is a number character.</returns>
        protected override bool IsTokenChar(char c)
        {
            // anything that is not a number character acts as a token separator
            bool partOfNumber = Char.IsNumber(c);
            return partOfNumber;
        }
    }

    /// <summary>
    /// Token stream that concatenates several token streams: it yields every token of the
    /// first stream, then every token of the second, and so on.
    /// </summary>
    public class MultiTokenStream : TokenStream
    {
        /// <summary>
        /// Creates a stream over a snapshot of <paramref name="tokenStreams"/>, consumed in
        /// enumeration order.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="tokenStreams"/> is null.
        /// </exception>
        public MultiTokenStream(IEnumerable<TokenStream> tokenStreams)
        {
            if (tokenStreams == null)
                throw new ArgumentNullException("tokenStreams");

            _tokenStreams = new List<TokenStream>(tokenStreams);
        }

        private readonly List<TokenStream> _tokenStreams;

        // Index of the next stream to start once the current one is exhausted.
        private int _nextStreamIndex;

        // Stream currently being drained; null before the first token and between streams.
        private TokenStream _currentTokenStream;

        /// <summary>Clears this stream's attributes and those of every wrapped stream.</summary>
        public override void ClearAttributes()
        {
            base.ClearAttributes();
            foreach (TokenStream tokenStream in _tokenStreams)
                tokenStream.ClearAttributes();
        }

        /// <summary>
        /// Advances to the next token, moving on to the following wrapped stream whenever
        /// the current one is exhausted.
        /// </summary>
        /// <returns>
        /// True when a token was produced and its attribute state copied into this stream;
        /// false once every wrapped stream is exhausted.
        /// </returns>
        public override bool IncrementToken()
        {
            while (true)
            {
                if (_currentTokenStream == null)
                {
                    if (_nextStreamIndex >= _tokenStreams.Count)
                        return false;

                    _currentTokenStream = _tokenStreams[_nextStreamIndex++];
                }

                if (_currentTokenStream.IncrementToken())
                {
                    // Copy the state only for a real token. Reporting success after an
                    // exhausted stream would emit a stale token at each stream boundary.
                    base.RestoreState(_currentTokenStream.CaptureState());
                    return true;
                }

                // Current stream is done (or was empty); try the next one.
                _currentTokenStream = null;
            }
        }

        /// <summary>
        /// Resets every wrapped stream and restarts iteration from the first one.
        /// </summary>
        public override void Reset()
        {
            foreach (TokenStream tokenStream in _tokenStreams)
                tokenStream.Reset();

            _nextStreamIndex = 0;
            _currentTokenStream = null;
        }

        /// <summary>Closes every wrapped stream.</summary>
        public override void Close()
        {
            foreach (TokenStream tokenStream in _tokenStreams)
                tokenStream.Close();

            base.Close();
        }
    }

    //// FROM: http://stackoverflow.com/questions/2925652/how-to-string-multiple-textreaders-together
    //public static class Extensions
    //{
    //    public static TextReader Union(this TextReader first, TextReader second)
    //    {
    //        return new ChainedTextReader(first, second);
    //    }

    //    private class ChainedTextReader : TextReader
    //    {
    //        private TextReader first;
    //        private TextReader second;
    //        private bool readFirst = true;

    //        public ChainedTextReader(TextReader first, TextReader second)
    //        {
    //            this.first = first;
    //            this.second = second;
    //        }

    //        public override int Peek()
    //        {
    //            if (readFirst)
    //            {
    //                return first.Peek();
    //            }
    //            else
    //            {
    //                return second.Peek();
    //            }
    //        }

    //        public override int Read()
    //        {
    //            if (readFirst)
    //            {
    //                int value = first.Read();
    //                if (value == -1)
    //                {
    //                    readFirst = false;
    //                }
    //                else
    //                {
    //                    return value;
    //                }
    //            }
    //            return second.Read();
    //        }

    //        public override void Close()
    //        {
    //            first.Close();
    //            second.Close();
    //        }

    //        protected override void Dispose(bool disposing)
    //        {
    //            base.Dispose(disposing);
    //            if (disposing)
    //            {
    //                first.Dispose();
    //                second.Dispose();
    //            }
    //        }
    //    }
    //}
}

This is available as a Gist here: https://gist.github.com/thoward/744444