/*
Public Class Public Static Int : Scanner - Inner - Buffer 1024 Delimiter

java.util.Scanner is a great tool for parsing, but it has some
disadvantages. One of them is an unchangeable buffer with a length of
1024. It means that working with strings bigger than 1024 will not be
correct - in fact, only the first 1024 symbols will be scanned. Also, the
java.util.Scanner class is final, so overriding its methods is not
available.

Here is a draft implementation of a solution in which the whole text is
split into N parts, each ending at the lexeme nearest to the n*1024th
symbol, and each part is scanned separately. In this example each found
lexeme is replaced with its upper-case form and enclosed in "<>".
*/
/**
 * Rewrites every lexeme of an arbitrarily long string using
 * {@code java.util.Scanner}, feeding the scanner one chunk at a time.
 *
 * <p>The text is cut into chunks that are strictly shorter than
 * {@link #SCANNER_INNER_BUFFER}. Each chunk is tokenized with
 * {@link #DELIMITER}; every token is passed through
 * {@link #doSmthWithLexeme(String)} and the text between tokens (the
 * delimiters themselves) is copied through unchanged.
 *
 * <p>With the default delimiter {@code \b} every run between two word
 * boundaries is a token, including whitespace and punctuation runs, so
 * {@code "hello world"} becomes {@code "<HELLO>< ><WORLD>"}.
 *
 * <p>Fully qualified JDK names are used throughout because this file
 * declares no imports.
 */
public class LargeStringScanner {

    /**
     * Size of the inner buffer of java.util.Scanner. Chunks handed to the
     * scanner are kept one character shorter than this value; the class
     * relies on the offsets reported by {@code Scanner.match()} being equal
     * to offsets into the chunk, which is assumed to hold only while the
     * chunk fits into that buffer without filling it completely.
     */
    public static int SCANNER_INNER_BUFFER = 1024;

    /** Lexeme delimiter regex. */
    public static String DELIMITER = "\\b";

    /**
     * Applies {@link #doSmthWithLexeme(String)} to every lexeme of
     * {@code body}.
     *
     * @param body text to process; may be {@code null} or empty
     * @return the processed text; {@code body} itself (that is, {@code null}
     *         or the empty string) when there is nothing to process
     */
    public String enhance(String body) {
        if (body == null || body.isEmpty()) {
            return body;
        }

        // Compile once per call instead of once per chunk / per character.
        final java.util.regex.Pattern delimiter = java.util.regex.Pattern.compile(DELIMITER);
        // Guarantees a chunk length of at least one character, so the loop
        // below always makes progress even if the public field was set to a
        // nonsensical value.
        final int bufferSize = Math.max(SCANNER_INNER_BUFFER, 2);

        final StringBuilder enhancedBody = new StringBuilder(body.length());
        int bodyPartStartPosition = 0;
        while (bodyPartStartPosition < body.length()) {
            final int remaining = body.length() - bodyPartStartPosition;
            final int bodyPartEndPosition;
            if (remaining < bufferSize) {
                // The rest fits into a single chunk; no need to look for a split point.
                bodyPartEndPosition = body.length();
            } else {
                bodyPartEndPosition = findLastDelimiterInSubString(
                        bodyPartStartPosition,
                        bodyPartStartPosition + bufferSize - 1,
                        body,
                        delimiter);
            }
            enhancePart(body.substring(bodyPartStartPosition, bodyPartEndPosition),
                    delimiter, enhancedBody);
            bodyPartStartPosition = bodyPartEndPosition;
        }
        return enhancedBody.toString();
    }

    /**
     * Scans one chunk and appends its processed form to {@code out}.
     */
    private void enhancePart(String subBody, java.util.regex.Pattern delimiter, StringBuilder out) {
        int charsRead = 0;
        try (java.util.Scanner scanner = new java.util.Scanner(subBody)) {
            scanner.useDelimiter(delimiter);
            while (scanner.hasNext()) {
                final String word = scanner.next();
                final java.util.regex.MatchResult match = scanner.match();
                // Copy whatever lies between the previous token and this one
                // (the delimiter text) unchanged.
                out.append(subBody, charsRead, match.start());
                out.append(doSmthWithLexeme(word));
                charsRead = match.end();
            }
        }
        // Trailing delimiter text after the last token.
        out.append(subBody, charsRead, subBody.length());
    }

    /**
     * Finds where to end a chunk so that it stops at a delimiter.
     *
     * <p>The delimiter is matched against the whole string with transparent,
     * non-anchoring bounds, so zero-width delimiters such as {@code \b} can
     * see the characters on both sides of a candidate position.
     *
     * @param startPosition inclusive start of the chunk
     * @param endPosition   largest allowed end of the chunk
     * @param largeString   the complete text
     * @param delimiter     compiled delimiter pattern
     * @return the start of the last delimiter match located after
     *         {@code startPosition} and not after {@code endPosition}, or
     *         {@code endPosition} when the range contains no such match (the
     *         chunk is then cut at the hard limit and may split a lexeme)
     */
    private int findLastDelimiterInSubString(int startPosition, int endPosition,
            String largeString, java.util.regex.Pattern delimiter) {
        final java.util.regex.Matcher matcher = delimiter.matcher(largeString)
                .region(startPosition, endPosition)
                .useTransparentBounds(true)
                .useAnchoringBounds(false);
        int lastDelimiter = endPosition;
        while (matcher.find()) {
            // A match at the chunk start would produce an empty chunk.
            if (matcher.start() > startPosition) {
                lastDelimiter = matcher.start();
            }
        }
        return lastDelimiter;
    }

    /**
     * Transforms a single lexeme. This example implementation upper-cases
     * it (locale-independently) and encloses it in {@code "<>"}; subclasses
     * may override it.
     *
     * @param lexeme token produced by the scanner
     * @return replacement text for the token
     */
    protected String doSmthWithLexeme(String lexeme) {
        return "<" + lexeme.toUpperCase(java.util.Locale.ROOT) + ">";
    }
}