//-----------------------------------------------------------------------------
// Imports
//
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Vector;
import java.util.Properties;
import java.net.URL;

import de.susebox.java.lang.ExtRuntimeException;

import de.susebox.java.util.Token;
import de.susebox.java.util.Tokenizer;
import de.susebox.java.util.TokenizerProperty;

import de.susebox.jtopas.InputStreamSource;
import de.susebox.jtopas.PluginTokenizer;
import de.susebox.jtopas.SequenceHandler;
import de.susebox.jtopas.WhitespaceHandler;


/**
 * Simple HTML contents extractor built on the JTopas {@link PluginTokenizer}.
 * The class registers itself as both {@link WhitespaceHandler} and
 * {@link SequenceHandler} and strips tags, comments and character entities
 * from a set of local HTML test files.
 */
public class MuJavaPluginTokenizer implements WhitespaceHandler, SequenceHandler {

  //---------------------------------------------------------------------------
  // properties
  //

  /**
   * The name of the test configuration file. This file will be read by
   * {@link java.lang.Class#getResourceAsStream}.
   */
  public static final String CONFIG_FILE = "C:\\Documents and Settings\\Maco\\workspace\\jtopas\\src\\de\\susebox\\jtopas\\TestPluginTokenizer.conf";

  /**
   * Property for the tests: path of the directory that contains the HTML files
   * used as test data source.
   */
  public static final String PROP_PATH = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www/";

  //---------------------------------------------------------------------------
  // main method
  //
  public static void main(String[] args) {
    MuJavaPluginTokenizer m = new MuJavaPluginTokenizer();

    m.testContentsParsing1();
    m.testContentsParsing2();
    m.testContentsParsing3();
    m.testContentsParsing4();
    m.testContentsParsing5();
    m.testContentsParsing6();
    m.testContentsParsing7();
    m.testContentsParsing8();
    m.testContentsParsing9();
    m.testContentsParsing10();
    m.testContentsParsing11();
    m.testContentsParsing12();
    m.testContentsParsing13();
    m.testContentsParsing14();
  }

  //---------------------------------------------------------------------------
  // test cases
  //
  public String testContentsParsing1() {
    String fileName = _path + "/test1.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing2() {
    String fileName = _path + "/test2.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing3() {
    String fileName = _path + "/test3.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing4() {
    String fileName = _path + "/test4.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing5() {
    String fileName = _path + "/test5.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing6() {
    String fileName = _path + "/test6.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing7() {
    String fileName = _path + "/test7.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing8() {
    String fileName = _path + "/test8.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing9() {
    String fileName = _path + "/test9.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing10() {
    String fileName = _path + "/test10.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing11() {
    String fileName = _path + "/test11.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing12() {
    String fileName = _path + "/test12.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing13() {
    String fileName = _path + "/test13.html";
    return ejecuta(fileName);
  }

  public String testContentsParsing14() {
    String fileName = _path + "/test14.html";
    return ejecuta(fileName);
  }

  private String ejecuta(String fileName) {
    try {
      InputStream stream = new FileInputStream(fileName);

      _reader = new InputStreamReader(stream);
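
      // Configure the plug-in tokenizer: F_NO_CASE makes sequence matching
      // case-insensitive, F_TOKEN_POS_ONLY lets the tokenizer record token
      // positions only (the token text is read back via current()). This
      // object serves as both the whitespace and the sequence handler
      // registered below.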
      PluginTokenizer tokenizer = new PluginTokenizer();

      System.out.println("\nStart extracting contents in \"" + _path + "\"");
      tokenizer.setSource(new InputStreamSource(_reader));
      tokenizer.setParseFlags(Tokenizer.F_NO_CASE | Tokenizer.F_TOKEN_POS_ONLY);
      tokenizer.setWhitespaceHandler(this);
      tokenizer.setSequenceHandler(this);

      String result = "";

      while (tokenizer.hasMoreToken()) {
        Token token = tokenizer.nextToken();

        // System.out.println(tokenizer.current());
        result += tokenizer.current();
      }
      _reader.close();
      return result;
    } catch (Throwable t) {
      return t.toString();
    }
  }

  //---------------------------------------------------------------------------
  // interface methods
  //

  /**
   * When registering an instance that implements this interface, the
   * {@link PluginTokenizer} will call this method to make itself known to
   * the handler instance in turn.
   *
   * @param tokenizer  the controlling {@link PluginTokenizer}
   */
  public void setTokenizer(PluginTokenizer tokenizer) {
    _tokenizer = tokenizer;
  }
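
  //
  // Whitespace handling: the two methods below tell the tokenizer how many of
  // the characters at a given buffer position are plain whitespace.
  //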
  /**
   * This method detects the number of whitespace characters starting at the given
   * position. It should use {@link de.susebox.java.util.Tokenizer#getChar} or
   * {@link de.susebox.java.util.AbstractTokenizer#getCharUnchecked} to retrieve a
   * character to check.
   *<br>
   * The method should return the number of characters identified as whitespaces
   * starting from and including the given start position.
   *<br>
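   * For example, if the input buffer contains two blanks followed by an 'H' at
   * startingAtPos and maxChars is large enough, the method returns 2.
   *<br>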
   * Do not attempt to actually read more data or do anything that leads to
   * a change of the data source or to tokenizer switching. This is done by the
   * tokenizer framework.
   *
   * @param startingAtPos  start checking for whitespace from this position
   * @param maxChars       if there is no non-whitespace character, check up to this number of characters
   * @return number of whitespace characters starting from the given offset
   * @throws TokenizerException failure while reading data from the input stream
   */
  public int readWhitespaces(int startingAtPos, int maxChars) {
    int pos    = startingAtPos;
    int endPos = startingAtPos + maxChars;

    while (pos < endPos) {
      if ( ! isWhitespace(_tokenizer.getCharUnchecked(pos))) {
        break;
      }
      pos++;
    }
    return pos - startingAtPos;
  }

  /**
   * This method checks if the character is a whitespace.
   *
   * @param testChar  check this character
   * @return true if the given character is a whitespace,
   *         false otherwise
   */
  public boolean isWhitespace(char testChar) {
    switch (testChar) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      return true;
    default:
      return false;
    }
  }
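
  //
  // Special sequence handling: the methods below recognize HTML tags, comments
  // and character entities in the tokenizer's input buffer.
  //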
  /**
   * Return a {@link de.susebox.java.util.TokenizerProperty} if the characters
   * starting at the given position comprise a special sequence (like the ++
   * operator in C and Java or the &amp;nbsp; in HTML), a comment starting
   * sequence or a string sign.<br>
   * Return null if no special sequence is present at the given
   * position.<br>
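   * For example, an input of "&lt;b&gt;" at the given position yields
   * {@link #BOLD_TAG}, while "&amp;auml;" yields {@link #AUML_TAG}.<br>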
   * Use the {@link de.susebox.java.util.AbstractTokenizer#getCharUnchecked} to
   * retrieve a character from the tokenizers input buffer.
   *
   * @param startingAtPos  check from this position in the tokenizers input buffer
   * @param maxChars       maximum number of characters that can be checked
   * @return a TokenizerProperty instance describing the special sequence,
   *         comment etc. or null if no such thing was found.
   */
  public TokenizerProperty isSequenceCommentOrString(int startingAtPos, int maxChars) {
    TokenizerProperty prop = null;
    String            text;
    char              nextChar;

    switch (_tokenizer.getCharUnchecked(startingAtPos)) {

    // handling a variety of HTML tags
    case '<':
      if (maxChars >= 2) {
        nextChar = _tokenizer.getCharUnchecked(startingAtPos + 1);

        switch (nextChar) {
        case '!':
          if (   maxChars >= 4
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '-'
              && _tokenizer.getCharUnchecked(startingAtPos + 3) == '-') {
            prop = HTML_COMMENT;
          }
          break;
        case 'H':
        case 'h':
          if (   maxChars >= 6
              && _tokenizer.getText(startingAtPos + 2, 4).compareToIgnoreCase("EAD>") == 0) {
            prop = HEAD_COMMENT;
          }
          break;
        case 'c':
        case 'C':
          if (   maxChars >= 6
              && _tokenizer.getText(startingAtPos + 2, 4).compareToIgnoreCase("ODE>") == 0) {
            prop = CODE_TAG;
          }
          break;
        case 'p':
        case 'P':
          if (   maxChars >= 5
              && _tokenizer.getText(startingAtPos + 2, 3).compareToIgnoreCase("RE>") == 0) {
            prop = PRE_TAG;
          }
          break;
        case '/':
          if (maxChars >= 7) {
            text = _tokenizer.getText(startingAtPos + 2, 5);
          } else if (maxChars >= 6) {
            text = _tokenizer.getText(startingAtPos + 2, 4);
          } else if (maxChars >= 4) {
            text = _tokenizer.getText(startingAtPos + 2, 2);
          } else {
            break;    // this is an unexpected situation (EOF in tag)
          }
          if (text.compareToIgnoreCase("B>") == 0) {
            prop = BOLD_END_TAG;
          } else if (text.compareToIgnoreCase("I>") == 0) {
            prop = ITALIC_END_TAG;
          } else if (text.compareToIgnoreCase("CODE>") == 0) {
            prop = CODE_END_TAG;
          } else if (text.compareToIgnoreCase("PRE>") == 0) {
            prop = PRE_END_TAG;
          }
          break;
        case 'b':
        case 'B':
          if (   maxChars >= 3
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
            prop = BOLD_TAG;
          }
          break;
        case 'i':
        case 'I':
          if (   maxChars >= 3
              && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
            prop = ITALIC_TAG;
          }
          break;
        }
      }

      // no special tag found - it is simply a tag regarded as a comment
      if (prop == null) {
        prop = TAG_COMMENT;
      }
      break;

    // handling special character encodings
    case '&':
      if (maxChars >= 8) {
        text = _tokenizer.getText(startingAtPos + 1, 7);
      } else if (maxChars >= 7) {
        text = _tokenizer.getText(startingAtPos + 1, 6);
      } else if (maxChars >= 6) {
        text = _tokenizer.getText(startingAtPos + 1, 5);
      } else {
        break;    // this is an unexpected situation (EOF in special character)
      }
      if (text.compareToIgnoreCase("auml;") == 0) {
        prop = AUML_TAG;
      } else if (text.compareToIgnoreCase("ouml;") == 0) {
        prop = OUML_TAG;
      } else if (text.compareToIgnoreCase("uuml;") == 0) {
        prop = UUML_TAG;
      } else if (text.compareToIgnoreCase("szlig;") == 0) {
        prop = SZLIG_TAG;
      } else if (text.compareToIgnoreCase("amp;lt;") == 0) {
        prop = LT_TAG;
      } else if (text.compareToIgnoreCase("amp;gt;") == 0) {
        prop = GT_TAG;
      }
      break;
    }

    // either we found one or the initial null is returned
    return prop;
  }
  /**
   * This method is called by the parent {@link PluginTokenizer} to learn how
   * many characters are needed by an instance of this interface to identify a
   * special sequence in the worst case. Usually that should be the length of
   * the longest possible special sequence, comment prefix etc.<br>
   * The tokenizer will make sure that at least this number of characters is
   * available when {@link SequenceHandler#isSequenceCommentOrString} is called.
   * If fewer characters are available, the end of the input has been reached.
   *
   * @return the number of characters needed in the worst case to identify a
   *         special sequence
   */
  public int getSequenceMaxLength() {
    return 8;     // length of "&amp;lt;"
  }

  //---------------------------------------------------------------------------
  // Members
  //
  private InputStreamReader _reader    = null;
  private String            _path      = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www";
  private PluginTokenizer   _tokenizer = null;

  //---------------------------------------------------------------------------
  // Constants
  //
  private static final Object PRE_START_COMPANION = new Object();
  private static final Object PRE_END_COMPANION   = new Object();

  private static final TokenizerProperty  TAG_COMMENT
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<", ">" }, null );

  private static final TokenizerProperty  HEAD_COMMENT
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<head>", "</head>" }, null, Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  HTML_COMMENT
    = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<!--", "-->" }, null );

  private static final TokenizerProperty  BOLD_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<b>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  BOLD_END_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</b>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  ITALIC_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<i>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  ITALIC_END_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</i>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  CODE_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<code>" }, "", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  CODE_END_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</code>" }, "", Tokenizer.F_NO_CASE );
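
  // Companion objects attached to the <pre> start and end tag properties.
  // This class only concatenates token images, but a caller could use these
  // companions to detect where preformatted regions begin and end (assumption;
  // the companions are not evaluated anywhere in this file).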
" }, PRE_START_COMPANION, Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  PRE_END_TAG 
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</pre>" }, PRE_END_COMPANION, Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  LT_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;lt;" }, "<", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  GT_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;gt;" }, ">", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  AUML_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&auml;" }, "ä", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  OUML_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&ouml;" }, "ö", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  UUML_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&uuml;" }, "ü", Tokenizer.F_NO_CASE );

  private static final TokenizerProperty  SZLIG_TAG
    = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&szlig;" }, "ß", Tokenizer.F_NO_CASE );
}