//-----------------------------------------------------------------------------
// Imports
//
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Vector;
import java.util.Properties;
import java.net.URL;
import de.susebox.java.lang.ExtRuntimeException;
import de.susebox.java.util.Token;
import de.susebox.java.util.Tokenizer;
import de.susebox.java.util.TokenizerProperty;
import de.susebox.jtopas.InputStreamSource;
import de.susebox.jtopas.PluginTokenizer;
import de.susebox.jtopas.SequenceHandler;
import de.susebox.jtopas.WhitespaceHandler;
public class MuJavaPluginTokenizer
implements WhitespaceHandler, SequenceHandler
{
//---------------------------------------------------------------------------
// properties
//
/**
* Absolute, machine-specific Windows path of a tokenizer test configuration
* file. Nothing in this class reads this constant.
* NOTE(review): presumably a leftover from the test class this driver was
* derived from - confirm before relying on it.
*/
public static final String CONFIG_FILE = "C:\\Documents and Settings\\Maco\\workspace\\jtopas\\src\\de\\susebox\\jtopas\\TestPluginTokenizer.conf";
/**
* Absolute, machine-specific path (with trailing slash) of a directory.
* Nothing in this class reads this constant; the test methods build their
* file names from the private <code>_path</code> member instead, which names
* the same directory without the trailing slash.
*/
public static final String PROP_PATH = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www/";
//---------------------------------------------------------------------------
// main method
//
/**
 * Runs the fourteen content-parsing scenarios once each, in numeric order,
 * on a single freshly created instance. The strings returned by the
 * scenarios are not used here.
 *
 * @param args command line arguments; not evaluated
 */
public static void main(String[] args) {
  final MuJavaPluginTokenizer driver = new MuJavaPluginTokenizer();

  driver.testContentsParsing1();
  driver.testContentsParsing2();
  driver.testContentsParsing3();
  driver.testContentsParsing4();
  driver.testContentsParsing5();
  driver.testContentsParsing6();
  driver.testContentsParsing7();
  driver.testContentsParsing8();
  driver.testContentsParsing9();
  driver.testContentsParsing10();
  driver.testContentsParsing11();
  driver.testContentsParsing12();
  driver.testContentsParsing13();
  driver.testContentsParsing14();
}
/**
 * Tokenizes the fixture file <code>test1.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing1() {
  return ejecuta(_path + "/test1.html");
}
/**
 * Tokenizes the fixture file <code>test2.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing2() {
  return ejecuta(_path + "/test2.html");
}
/**
 * Tokenizes the fixture file <code>test3.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing3() {
  return ejecuta(_path + "/test3.html");
}
/**
 * Tokenizes the fixture file <code>test4.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing4() {
  return ejecuta(_path + "/test4.html");
}
/**
 * Tokenizes the fixture file <code>test5.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing5() {
  return ejecuta(_path + "/test5.html");
}
/**
 * Tokenizes the fixture file <code>test6.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing6() {
  return ejecuta(_path + "/test6.html");
}
/**
 * Tokenizes the fixture file <code>test7.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing7() {
  return ejecuta(_path + "/test7.html");
}
/**
 * Tokenizes the fixture file <code>test8.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing8() {
  return ejecuta(_path + "/test8.html");
}
/**
 * Tokenizes the fixture file <code>test9.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing9() {
  return ejecuta(_path + "/test9.html");
}
/**
 * Tokenizes the fixture file <code>test10.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing10() {
  return ejecuta(_path + "/test10.html");
}
/**
 * Tokenizes the fixture file <code>test11.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing11() {
  return ejecuta(_path + "/test11.html");
}
/**
 * Tokenizes the fixture file <code>test12.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing12() {
  return ejecuta(_path + "/test12.html");
}
/**
 * Tokenizes the fixture file <code>test13.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing13() {
  return ejecuta(_path + "/test13.html");
}
/**
 * Tokenizes the fixture file <code>test14.html</code> in the configured directory.
 *
 * @return whatever <code>ejecuta</code> returns for that file
 */
public String testContentsParsing14() {
  return ejecuta(_path + "/test14.html");
}
/**
 * Tokenizes one file with a {@link PluginTokenizer} that uses this instance
 * as both its whitespace handler and its sequence handler, and collects the
 * text of every token the tokenizer delivers.
 *
 * The reader is closed on every path, including when tokenizing fails.
 * No exception leaves this method: any {@link Throwable} is converted into
 * its string form and returned in place of the result.
 *
 * @param fileName path of the file to tokenize
 * @return the token texts concatenated in reading order, or the
 *         <code>toString()</code> of whatever was thrown
 */
private String ejecuta(String fileName) {
  try {
    // the bytes are decoded with the platform default charset
    InputStreamReader reader = new InputStreamReader(new FileInputStream(fileName));
    _reader = reader;
    try {
      PluginTokenizer tokenizer = new PluginTokenizer();

      // report the file actually being processed, not just its directory
      System.out.println("\nStart extracting contents in \"" + fileName + "\"");
      tokenizer.setSource(new InputStreamSource(reader));
      tokenizer.setParseFlags(Tokenizer.F_NO_CASE | Tokenizer.F_TOKEN_POS_ONLY);
      tokenizer.setWhitespaceHandler(this);
      tokenizer.setSequenceHandler(this);

      // StringBuffer keeps the loop linear and is available on every Java version
      StringBuffer result = new StringBuffer();
      while (tokenizer.hasMoreToken()) {
        // the returned token is not needed; the call only advances the tokenizer
        tokenizer.nextToken();
        result.append(tokenizer.current());
      }
      return result.toString();
    } finally {
      // closing the reader also closes the underlying file stream
      reader.close();
    }
  }
  catch (Throwable t) {
    // deliberate catch-all: callers get the failure as text
    return t.toString();
  }
}
//---------------------------------------------------------------------------
// interface methods
//
/**
 * Remembers the tokenizer this handler works for. The other handler methods
 * of this class read characters from the tokenizer stored here.
 *
 * @param tokenizer the controlling {@link PluginTokenizer}
 */
public void setTokenizer(PluginTokenizer tokenizer) {
  this._tokenizer = tokenizer;
}
/**
 * Counts the consecutive whitespace characters that begin at the given
 * position, looking at no more than <code>maxChars</code> characters.
 * Characters are fetched from the registered tokenizer with
 * <code>getCharUnchecked</code> and classified by {@link #isWhitespace}.
 *
 * @param startingAtPos position of the first character to examine
 * @param maxChars      upper bound for the number of characters examined
 * @return length of the whitespace run starting at <code>startingAtPos</code>;
 *         0 if the first character is not a whitespace or
 *         <code>maxChars</code> is not positive
 */
public int readWhitespaces(int startingAtPos, int maxChars) {
  final int limit = startingAtPos + maxChars;
  int cursor = startingAtPos;

  // advance until the first non-whitespace character or the limit is hit
  while (cursor < limit && isWhitespace(_tokenizer.getCharUnchecked(cursor))) {
    cursor++;
  }
  return cursor - startingAtPos;
}
/**
 * Tells whether a character counts as whitespace for this handler. Only
 * blank, horizontal tab, carriage return and line feed do.
 *
 * @param testChar the character to classify
 * @return <code>true</code> if the character is one of the four whitespace
 *         characters, <code>false</code> otherwise
 */
public boolean isWhitespace(char testChar) {
  return testChar == ' '
      || testChar == '\t'
      || testChar == '\r'
      || testChar == '\n';
}
/**
 * Inspects the tokenizer's input buffer at the given position and reports
 * which HTML construct starts there.
 *
 * A '&lt;' always yields a property: one of the specific tag properties when
 * the following characters spell a known tag (comment start, HEAD, CODE,
 * PRE, B, I and the end tags of B, I, CODE, PRE), otherwise the generic
 * <code>TAG_COMMENT</code>. An '&amp;' yields a property only when one of the
 * known character entities follows. Any other character yields
 * <code>null</code>.
 *
 * End tags and entities are recognized by case-insensitive prefix match on
 * the characters read ahead, so they are found regardless of what follows
 * them in the input.
 *
 * @param startingAtPos position in the tokenizer's input buffer to check
 * @param maxChars      number of characters that may be examined from
 *                      <code>startingAtPos</code> on
 * @return the property describing the construct found, or <code>null</code>
 *         if none starts at the given position
 */
public TokenizerProperty isSequenceCommentOrString(int startingAtPos, int maxChars) {
  TokenizerProperty prop = null;
  String text;
  char nextChar;

  switch (_tokenizer.getCharUnchecked(startingAtPos)) {
  // handling a variety of HTML tags
  case '<':
    if (maxChars >= 2) {
      nextChar = _tokenizer.getCharUnchecked(startingAtPos + 1);
      switch (nextChar) {
      case '!':
        if (   maxChars >= 4
            && _tokenizer.getCharUnchecked(startingAtPos + 2) == '-'
            && _tokenizer.getCharUnchecked(startingAtPos + 3) == '-') {
          prop = HTML_COMMENT;
        }
        break;
      case 'H':
      case 'h':
        if (   maxChars >= 6
            && _tokenizer.getText(startingAtPos + 2, 4).compareToIgnoreCase("EAD>") == 0) {
          prop = HEAD_COMMENT;
        }
        break;
      case 'c':
      case 'C':
        if (   maxChars >= 6
            && _tokenizer.getText(startingAtPos + 2, 4).compareToIgnoreCase("ODE>") == 0) {
          prop = CODE_TAG;
        }
        break;
      case 'p':
      case 'P':
        if (   maxChars >= 5
            && _tokenizer.getText(startingAtPos + 2, 3).compareToIgnoreCase("RE>") == 0) {
          prop = PRE_TAG;
        }
        break;
      case '/':
        // fewer than 4 characters cannot hold even the shortest end tag (EOF in tag)
        if (maxChars >= 4) {
          // read at most the 5 characters of the longest candidate "CODE>";
          // the text may run past a shorter tag, hence the prefix checks
          text = _tokenizer.getText(startingAtPos + 2, Math.min(maxChars - 2, 5));
          if (startsWithIgnoreCase(text, "B>")) {
            prop = BOLD_END_TAG;
          } else if (startsWithIgnoreCase(text, "I>")) {
            prop = ITALIC_END_TAG;
          } else if (startsWithIgnoreCase(text, "CODE>")) {
            prop = CODE_END_TAG;
          } else if (startsWithIgnoreCase(text, "PRE>")) {
            prop = PRE_END_TAG;
          }
        }
        break;
      case 'b':
      case 'B':
        if (   maxChars >= 3
            && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
          prop = BOLD_TAG;
        }
        break;
      case 'i':
      case 'I':
        if (   maxChars >= 3
            && _tokenizer.getCharUnchecked(startingAtPos + 2) == '>') {
          prop = ITALIC_TAG;
        }
        break;
      }
    }
    // no special tag found - it is simply a tag regarded as a comment
    if (prop == null) {
      prop = TAG_COMMENT;
    }
    break;

  // handling special character encodings
  case '&':
    // fewer than 6 characters cannot hold even the shortest entity (EOF in entity)
    if (maxChars >= 6) {
      // read at most the 7 characters of the longest candidates "amp;lt;" / "amp;gt;"
      text = _tokenizer.getText(startingAtPos + 1, Math.min(maxChars - 1, 7));
      if (startsWithIgnoreCase(text, "auml;")) {
        prop = AUML_TAG;
      } else if (startsWithIgnoreCase(text, "ouml;")) {
        prop = OUML_TAG;
      } else if (startsWithIgnoreCase(text, "uuml;")) {
        prop = UUML_TAG;
      } else if (startsWithIgnoreCase(text, "szlig;")) {
        prop = SZLIG_TAG;
      } else if (startsWithIgnoreCase(text, "amp;lt;")) {
        prop = LT_TAG;
      } else if (startsWithIgnoreCase(text, "amp;gt;")) {
        prop = GT_TAG;
      }
    }
    break;
  }

  // either we found one or the initial null is returned
  return prop;
}

/**
 * Tells whether <code>text</code> begins with <code>prefix</code>, ignoring
 * case. A text shorter than the prefix never matches.
 */
private static boolean startsWithIgnoreCase(String text, String prefix) {
  return text.regionMatches(true, 0, prefix, 0, prefix.length());
}
/**
 * Reports how many characters this handler wants to look at, in the worst
 * case, to decide whether a special sequence or comment starts at a position.
 *
 * @return the worst-case lookahead, in characters
 */
public int getSequenceMaxLength() {
  // '&' plus the seven characters of "amp;lt;" / "amp;gt;" is the longest
  // run of characters the sequence detection of this class reads
  final int longestSequence = 8;
  return longestSequence;
}
//---------------------------------------------------------------------------
// Members
//
// Reader over the file most recently opened by ejecuta().
private InputStreamReader _reader = null;
// Directory the testContentsParsing methods take their test<N>.html files
// from; an absolute path specific to one machine.
private String _path = "C:/Documents and Settings/Maco/workspace/jtopas/src/de/susebox/www";
// Tokenizer handed in through setTokenizer(); readWhitespaces() and
// isSequenceCommentOrString() read their characters from it.
private PluginTokenizer _tokenizer = null;
//---------------------------------------------------------------------------
// Constants
//
// NOTE(review): the literals of this section had lost their HTML markup
// (unterminated strings, empty or decoded images, six properties missing
// altogether). The images below are rebuilt from what
// isSequenceCommentOrString() compares against; where that method gives no
// hint, the value is marked as an assumption to be confirmed.

// marker objects used as companions of the PRE start and end tag
private static final Object PRE_START_COMPANION = new Object();
private static final Object PRE_END_COMPANION = new Object();

// any tag that is not handled specially
private static final TokenizerProperty TAG_COMMENT
  = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<", ">" }, null );
// NOTE(review): end image "</HEAD>" and the "" companion are assumed - confirm
private static final TokenizerProperty HEAD_COMMENT
  = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<HEAD>", "</HEAD>" }, "", Tokenizer.F_NO_CASE );
// NOTE(review): definition was missing; end image "-->" and null companion are assumed - confirm
private static final TokenizerProperty HTML_COMMENT
  = new TokenizerProperty(Token.BLOCK_COMMENT, new String[] { "<!--", "-->" }, null );

// NOTE(review): the next five definitions were missing; the "" companions are
// assumed by analogy with CODE_END_TAG - confirm
private static final TokenizerProperty BOLD_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<B>" }, "", Tokenizer.F_NO_CASE );
private static final TokenizerProperty BOLD_END_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</B>" }, "", Tokenizer.F_NO_CASE );
private static final TokenizerProperty ITALIC_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<I>" }, "", Tokenizer.F_NO_CASE );
private static final TokenizerProperty ITALIC_END_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</I>" }, "", Tokenizer.F_NO_CASE );
private static final TokenizerProperty CODE_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<CODE>" }, "", Tokenizer.F_NO_CASE );

private static final TokenizerProperty CODE_END_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</CODE>" }, "", Tokenizer.F_NO_CASE );
private static final TokenizerProperty PRE_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "<PRE>" }, PRE_START_COMPANION, Tokenizer.F_NO_CASE );
private static final TokenizerProperty PRE_END_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "</PRE>" }, PRE_END_COMPANION, Tokenizer.F_NO_CASE );

// character entities; the companion is the character the entity stands for
// (non-ASCII companions are written as Unicode escapes to be independent of
// the source file encoding)
private static final TokenizerProperty LT_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;lt;" }, "<", Tokenizer.F_NO_CASE );
private static final TokenizerProperty GT_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&amp;gt;" }, ">", Tokenizer.F_NO_CASE );
private static final TokenizerProperty AUML_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&auml;" }, "\u00E4", Tokenizer.F_NO_CASE );
private static final TokenizerProperty OUML_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&ouml;" }, "\u00F6", Tokenizer.F_NO_CASE );
private static final TokenizerProperty UUML_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&uuml;" }, "\u00FC", Tokenizer.F_NO_CASE );
private static final TokenizerProperty SZLIG_TAG
  = new TokenizerProperty(Token.SPECIAL_SEQUENCE, new String[] { "&szlig;" }, "\u00DF", Tokenizer.F_NO_CASE );
}