//-----------------------------------------------------------------------------
// Imports
//
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Vector;
import java.util.Properties;
import java.net.URL;

import de.susebox.java.lang.ExtRuntimeException;
import de.susebox.java.util.Token;
import de.susebox.java.util.Tokenizer;
import de.susebox.java.util.TokenizerProperty;
import de.susebox.jtopas.InputStreamSource;
import de.susebox.jtopas.PluginTokenizer;
import de.susebox.jtopas.SequenceHandler;
import de.susebox.jtopas.WhitespaceHandler;


public class MuJavaPluginTokenizer 
  implements  WhitespaceHandler, SequenceHandler
{
  
  //---------------------------------------------------------------------------
  // properties
  //

  /**
   * Absolute path of the test configuration file
   * (<code>TestPluginTokenizer.conf</code>).
   * <br>
   * NOTE(review): nothing in this class reads this constant, and the value is
   * a machine-specific absolute Windows path rather than a class path resource
   * name, so it does not look usable with
   * {@link java.lang.Class#getResourceAsStream} - confirm how (and if) it is
   * still used.
   */
  public static final String CONFIG_FILE = "C:\\Documents and Settings\\Maco\\workspace\\jtopas\\src\\de\\susebox\\jtopas\\TestPluginTokenizer.conf";
  
  /**
   * Path of the directory that holds the test data, with a trailing slash.
   * <br>
   * NOTE(review): this is a machine-specific absolute path. The test methods
   * of this class build their file names from the private <code>_path</code>
   * member (same directory, no trailing slash), not from this constant -
   * confirm whether external code still relies on it.
   */
  public static final String PROP_PATH = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www/";
  
  
  //---------------------------------------------------------------------------
  // main method and test cases
  //
  /**
   * Runs the fourteen content parsing cases, in ascending order, on a single
   * instance of this class. The strings returned by the cases are not used
   * here.
   *
   * @param args  command line arguments (ignored)
   */
  public static void main(String[] args) {
    MuJavaPluginTokenizer runner = new MuJavaPluginTokenizer();

    runner.testContentsParsing1();
    runner.testContentsParsing2();
    runner.testContentsParsing3();
    runner.testContentsParsing4();
    runner.testContentsParsing5();
    runner.testContentsParsing6();
    runner.testContentsParsing7();
    runner.testContentsParsing8();
    runner.testContentsParsing9();
    runner.testContentsParsing10();
    runner.testContentsParsing11();
    runner.testContentsParsing12();
    runner.testContentsParsing13();
    runner.testContentsParsing14();
  }
   
  /**
   * Tokenizes <code>test1.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing1() {
    return ejecuta(_path + "/test1.html");
  }
  
  /**
   * Tokenizes <code>test2.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing2() {
    return ejecuta(_path + "/test2.html");
  }
  
  /**
   * Tokenizes <code>test3.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing3() {
    return ejecuta(_path + "/test3.html");
  }
  
  /**
   * Tokenizes <code>test4.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing4() {
    return ejecuta(_path + "/test4.html");
  }
  
  /**
   * Tokenizes <code>test5.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing5() {
    return ejecuta(_path + "/test5.html");
  }
  
  /**
   * Tokenizes <code>test6.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing6() {
    return ejecuta(_path + "/test6.html");
  }
  
  /**
   * Tokenizes <code>test7.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing7() {
    return ejecuta(_path + "/test7.html");
  }
  
  /**
   * Tokenizes <code>test8.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing8() {
    return ejecuta(_path + "/test8.html");
  }
  
  /**
   * Tokenizes <code>test9.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing9() {
    return ejecuta(_path + "/test9.html");
  }
  
  /**
   * Tokenizes <code>test10.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing10() {
    return ejecuta(_path + "/test10.html");
  }
  
  /**
   * Tokenizes <code>test11.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing11() {
    return ejecuta(_path + "/test11.html");
  }
  
  /**
   * Tokenizes <code>test12.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing12() {
    return ejecuta(_path + "/test12.html");
  }
  
  /**
   * Tokenizes <code>test13.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing13() {
    return ejecuta(_path + "/test13.html");
  }
  
  /**
   * Tokenizes <code>test14.html</code> from the directory named by
   * <code>_path</code>.
   *
   * @return whatever <code>ejecuta</code> returns for that file
   */
  public String testContentsParsing14() {
    return ejecuta(_path + "/test14.html");
  }

/**
 * Tokenizes one file and returns the concatenation of everything the
 * tokenizer reports.
 * <br>
 * The file is read through an {@link InputStreamReader} using the platform
 * default charset. A fresh {@link PluginTokenizer} is configured with the
 * flags <code>F_NO_CASE</code> and <code>F_TOKEN_POS_ONLY</code> and with this
 * object as both whitespace and sequence handler. After every
 * <code>nextToken()</code> call the value of <code>current()</code> is
 * appended to the result.
 * <br>
 * The reader (and with it the underlying file stream) is closed on every
 * path, including failures while tokenizing.
 * <br>
 * Failures are deliberately reported through the return value instead of
 * being thrown, so callers always get a string they can compare.
 *
 * @param  fileName  path of the file to tokenize
 * @return the concatenated token images, or the <code>toString()</code> value
 *         of the {@link Throwable} that aborted the run
 */
private String ejecuta(String fileName) {
  try {
    InputStreamReader reader = new InputStreamReader(new FileInputStream(fileName));

    // the member is kept up to date for parity with the previous behaviour
    _reader = reader;
    try {
      PluginTokenizer tokenizer = new PluginTokenizer();

      // report the file that is actually parsed, not just its directory
      System.out.println("\nStart extracting contents in \"" + fileName + "\"");

      tokenizer.setSource(new InputStreamSource(reader));
      tokenizer.setParseFlags(Tokenizer.F_NO_CASE | Tokenizer.F_TOKEN_POS_ONLY);
      tokenizer.setWhitespaceHandler(this);
      tokenizer.setSequenceHandler(this);

      StringBuilder result = new StringBuilder();

      while (tokenizer.hasMoreToken()) {
        // the returned token is not needed; the call only advances the tokenizer
        tokenizer.nextToken();
        result.append(tokenizer.current());
      }
      return result.toString();
    } finally {
      // closing the reader also closes the wrapped FileInputStream
      reader.close();
    }
  } catch (Throwable t) {
    // intentionally broad: every failure becomes part of the returned string
    return t.toString();
  }
}
  
 
  //---------------------------------------------------------------------------
  // interface methods
  //

  /**
   * Remembers the tokenizer this handler works for. According to the handler
   * contract the {@link PluginTokenizer} calls this method itself when an
   * instance of this class is registered with it.
   * <br>
   * The stored tokenizer is the one whose input buffer
   * <code>readWhitespaces</code> and <code>isSequenceCommentOrString</code>
   * read from.
   *
   * @param tokenizer   the controlling {@link PluginTokenizer}
   */
  public void setTokenizer(PluginTokenizer tokenizer) {
    this._tokenizer = tokenizer;
  }
  
  /**
   * Counts the consecutive whitespace characters in the tokenizer's input
   * buffer, beginning with the character at the given position.
   * <br>
   * Characters are fetched with <code>getCharUnchecked</code> from the
   * tokenizer passed to <code>setTokenizer</code> and classified by
   * <code>isWhitespace</code>. Scanning stops at the first non-whitespace
   * character or after <code>maxChars</code> characters, whichever comes first.
   * <br>
   * This method only inspects the buffer; it does not read more data or touch
   * the data source. That is left to the tokenizer framework.
   *
   * @param   startingAtPos  position of the first character to check
   * @param   maxChars       upper limit for the number of characters to check
   * @return  number of whitespace characters found, between 0 and
   *          <code>maxChars</code>
   */
  public int readWhitespaces(int startingAtPos, int maxChars) {
    final int limit   = startingAtPos + maxChars;
    int       current = startingAtPos;

    while (current < limit && isWhitespace(_tokenizer.getCharUnchecked(current))) {
      current++;
    }
    return current - startingAtPos;
  }

  
  /**
   * Tells whether a character counts as whitespace for this handler. Exactly
   * four characters do: blank, horizontal tab, carriage return and line feed.
   *
   * @param testChar  the character to classify
   * @return <CODE>true</CODE> if the character is one of the four whitespaces,
   *        <CODE>false</CODE> otherwise
   */
  public boolean isWhitespace(char testChar) {
    return testChar == ' '
        || testChar == '\t'
        || testChar == '\r'
        || testChar == '\n';
  }
  
  /**
   * Return a {@link de.susebox.java.util.TokenizerProperty} if the characters
   * starting at the given position comprise a special sequence (like the ++ operator
   * in C and Java or the &amp;nbsp; in HTML), a comment starting sequence or
   * a string sign.<br>
   * Return <code>null</code> if no special sequence is present at the given
   * position.<br>
   * This implementation recognizes the following, all read from the input
   * buffer of the tokenizer passed to <code>setTokenizer</code>:
   * <ul>
   *   <li>anything starting with '&lt;': the HTML comment start, the HEAD
   *       block, the B, I, CODE and PRE tags with their end tags, and - as
   *       fallback for every other '&lt;' - a generic tag that is treated as
   *       a block comment</li>
   *   <li>the character entities for the German umlauts and sharp s, and the
   *       doubly escaped less-than / greater-than entities</li>
   * </ul>
   * All comparisons ignore case. End tags and entities are matched as prefixes
   * of the available text, because more characters than the sequence itself
   * are usually available.
   *
   * @param  startingAtPos check from this position in the tokenizers input buffer
   * @param  maxChars      number of characters available from <code>startingAtPos</code>
   *                       on; never read beyond that
   * @return a <code>TokenizerProperty</code> instance describing the special sequence,
   *        comment etc. or <code>null</code> if no such thing was found.
   */
  public TokenizerProperty isSequenceCommentOrString(int startingAtPos, int maxChars) {
    TokenizerProperty prop = null;
    String            text;

    switch (_tokenizer.getCharUnchecked(startingAtPos)) {
    // handling a variety of HTML tags
    case '<':
      if (maxChars >= 2) {
        switch (_tokenizer.getCharUnchecked(startingAtPos + 1)) {
        case '!':
          if (   maxChars >= 4
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '-'
              && _tokenizer.getCharUnchecked(startingAtPos + 3) == '-') {
            prop = HTML_COMMENT;
          }
          break;
        case 'H':
        case 'h':
          if (   maxChars >= 6
              && startsWithIgnoreCase(_tokenizer.getText(startingAtPos + 2, 4), "EAD>")) {
            prop = HEAD_COMMENT;
          }
          break;
        case 'c':
        case 'C':
          if (   maxChars >= 6
              && startsWithIgnoreCase(_tokenizer.getText(startingAtPos + 2, 4), "ODE>")) {
            prop = CODE_TAG;
          }
          break;
        case 'p':
        case 'P':
          if (   maxChars >= 5
              && startsWithIgnoreCase(_tokenizer.getText(startingAtPos + 2, 3), "RE>")) {
            prop = PRE_TAG;
          }
          break;
        case '/':
          // The shortest end tag needs 4 characters; fewer means EOF inside
          // the tag, which is not expected. The longest one handled here has
          // 5 characters after the leading "</".
          if (maxChars >= 4) {
            text = _tokenizer.getText(startingAtPos + 2, Math.min(maxChars - 2, 5));

            // prefix match: the text read is usually longer than a short tag
            if (startsWithIgnoreCase(text, "B>")) {
              prop = BOLD_END_TAG;
            } else if (startsWithIgnoreCase(text, "I>")) {
              prop = ITALIC_END_TAG;
            } else if (startsWithIgnoreCase(text, "CODE>")) {
              prop = CODE_END_TAG;
            } else if (startsWithIgnoreCase(text, "PRE>")) {
              prop = PRE_END_TAG;
            }
          }
          break;
        case 'b':
        case 'B':
          if (   maxChars >= 3
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
            prop = BOLD_TAG;
          }
          break;
        case 'i':
        case 'I':
          if (   maxChars >= 3
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
            prop = ITALIC_TAG;
          }
          break;
        }
      }

      // no special tag found - its simply a tag regarded as a comment
      if (prop == null) {
        prop = TAG_COMMENT;
      }
      break;

    // handling special character encodings
    case '&':
      // The shortest entity handled here needs 6 characters; fewer means EOF
      // inside the entity, which is not expected. The longest one has 7
      // characters after the leading '&'.
      if (maxChars >= 6) {
        text = _tokenizer.getText(startingAtPos + 1, Math.min(maxChars - 1, 7));

        // prefix match: the text read is usually longer than a short entity
        if (startsWithIgnoreCase(text, "auml;")) {
          prop = AUML_TAG;
        } else if (startsWithIgnoreCase(text, "ouml;")) {
          prop = OUML_TAG;
        } else if (startsWithIgnoreCase(text, "uuml;")) {
          prop = UUML_TAG;
        } else if (startsWithIgnoreCase(text, "szlig;")) {
          prop = SZLIG_TAG;
        } else if (startsWithIgnoreCase(text, "amp;lt;")) {
          prop = LT_TAG;
        } else if (startsWithIgnoreCase(text, "amp;gt;")) {
          prop = GT_TAG;
        }
      }
      break;
    }

    // either we found one or the initial null is returned
    return prop;
  }

  /**
   * Checks case-insensitively whether a text begins with the given prefix.
   *
   * @param  text    the text to inspect
   * @param  prefix  the expected beginning of <code>text</code>
   * @return <code>true</code> if <code>text</code> is at least as long as
   *         <code>prefix</code> and starts with it, ignoring case
   */
  private static boolean startsWithIgnoreCase(String text, String prefix) {
    return text.regionMatches(true, 0, prefix, 0, prefix.length());
  }
  
  
  /**
   * Tells the parent {@link PluginTokenizer} how many characters this handler
   * needs in the worst case to identify a special sequence, comment prefix
   * etc. According to the handler contract the tokenizer makes at least that
   * many characters available when it calls
   * {@link SequenceHandler#isSequenceCommentOrString}, unless EOF is reached
   * earlier.
   * <br>
   * The longest sequences recognized by this class are the doubly escaped
   * less-than / greater-than entities, so their length is reported.
   *
   * @return  the number of characters needed in the worst case to identify a
   *          special sequence (8)
   */
  public int getSequenceMaxLength() {
    return "&amp;lt;".length();
  }
  
  
  //---------------------------------------------------------------------------
  // Members
  //
  // Reader over the file that ejecuta() opened most recently; assigned anew on
  // every ejecuta() call.
  private InputStreamReader _reader     = null;
  // Directory containing the test1.html ... test14.html input files (no trailing
  // slash; the test methods append "/testN.html").
  // NOTE(review): machine-specific absolute path - the tests only run where
  // this directory exists.
  private String            _path       = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www";
  // Tokenizer handed in through setTokenizer(); readWhitespaces() and
  // isSequenceCommentOrString() read its input buffer. Stays null until
  // setTokenizer() has been called.
  private PluginTokenizer   _tokenizer  = null;
  
  
  //---------------------------------------------------------------------------
  // Constants
  //
  // Marker objects passed as third constructor argument of PRE_TAG and
  // PRE_END_TAG (presumably the "companion" of the property - confirm against
  // the TokenizerProperty API).
  private static final Object PRE_START_COMPANION = new Object();
  private static final Object PRE_END_COMPANION   = new Object();
  
  // The properties below are the possible results of isSequenceCommentOrString().
  // Each is built from a token type, the image(s) of the sequence, a third
  // argument (null, a replacement-like string or a marker object) and, for most
  // of them, the F_NO_CASE flag.

  // Fallback for any '<' that starts none of the specially handled tags: the
  // whole "<...>" is a block comment.
  private static final TokenizerProperty  TAG_COMMENT 
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<", ">" }, null );

  // Everything from "<HEAD>" to "</HEAD>" is a block comment.
  private static final TokenizerProperty  HEAD_COMMENT 
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<HEAD>", "</HEAD>" }, null, Tokenizer.F_NO_CASE );
    
  // HTML comment "<!-- ... -->".
  private static final TokenizerProperty  HTML_COMMENT 
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<!--", "-->" }, null );
    
  // Formatting tags, registered as special sequences with an empty string as
  // third argument.
  private static final TokenizerProperty  BOLD_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<b>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  BOLD_END_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</b>" }, "", Tokenizer.F_NO_CASE );
    
  private static final TokenizerProperty  ITALIC_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<i>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  ITALIC_END_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</i>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  CODE_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<code>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  CODE_END_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</code>" }, "", Tokenizer.F_NO_CASE );

  // The PRE tags carry the marker objects instead of a string, so they can be
  // told apart from the other special sequences by identity.
  private static final TokenizerProperty  PRE_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<pre>" }, PRE_START_COMPANION, Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  PRE_END_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</pre>" }, PRE_END_COMPANION, Tokenizer.F_NO_CASE );

  // Character entities; the third argument is the character the entity stands
  // for. The less-than / greater-than entities are expected in their doubly
  // escaped form ("&amp;lt;" rather than "&lt;").
  private static final TokenizerProperty  LT_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;lt;" }, "<", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  GT_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;gt;" }, ">", Tokenizer.F_NO_CASE );

  // NOTE(review): the non-ASCII literals below depend on the encoding the
  // compiler assumes for this source file.
  private static final TokenizerProperty  AUML_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&auml;" }, "ä", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  OUML_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&ouml;" }, "ö", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  UUML_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&uuml;" }, "ü", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  SZLIG_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&szlig;" }, "ß", Tokenizer.F_NO_CASE );
}