Parsing
This section reviews the approaches for finding, tokenizing, and formatting data.
Let's see the first example:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Minimal {@code java.util.regex} demo: prints the start index of every match
 * of the pattern {@code aba} in the source text {@code abababa}.
 */
public class RegexFirst {

    /** The expression to search for; compiled once and reused. */
    private static final Pattern PATTERN = Pattern.compile("aba");

    /**
     * Prints the start index of each match, separated by spaces.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Source indices start at 0:  a b a b a b a
        //                             0 1 2 3 4 5 6
        Matcher m = PATTERN.matcher("abababa");
        // Each find() resumes after the end of the previous match, so the "aba"
        // that starts at index 2 is never reported: its first character was
        // already consumed by the match that starts at index 0.
        while (m.find()) {
            System.out.print(m.start() + " "); // Prints 0 4
        }
    }
}
As you can see, the static method compile of the class Pattern takes the pattern (the expression you want to search for), and the method matcher of the class Pattern takes the source text and returns a Matcher, which is then used to search that source.
!Important:
a b a b a b a
0 1 2 3 4 5 6
Why didn't the little program shown above print 0 2 4?
The reason is that the regex engine does not consider index 2 as the start of a match: that character was already consumed by the first match (indices 0 to 2), and a consumed character cannot be reused. There are exceptions to this rule, which will be shown later.
a b a
0 1 2
Using Metacharacters
\d A digit
\s A whitespace character
\w A word character (letters, digits, or "_" (underscore))
. Any character
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Demonstrates regex metacharacters and character classes by printing, for each
 * expression, the start index of every match found in the same source text.
 */
public class RegexMeta {

    /**
     * Text searched by every expression.
     *
     * <pre>
     * abcfd5 9erARqf C1#
     * 012345678901234567
     * </pre>
     */
    private static final String SOURCE = "abcfd5 9erARqf C1#";

    /** Expressions to try, in output order; the array index is the matcher number. */
    private static final String[] REGEXES = {
        "\\d",       // a digit
        "\\s",       // a whitespace character
        "\\w",       // a word character (letters, digits or "_")
        ".",         // any character
        "[abc]",     // only a, b or c
        "[a-f]",     // only a through f
        "[a-fA-F]"   // a through f, either case
    };

    /**
     * Prints a "Matcher ==> n" header followed by the match start indices for
     * each expression in {@link #REGEXES}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Driving the loop from the table keeps the number of matchers and the
        // number of expressions from drifting apart.
        for (int i = 0; i < REGEXES.length; i++) {
            Matcher matcher = Pattern.compile(REGEXES[i]).matcher(SOURCE);
            System.out.println("Matcher ==> " + i);
            while (matcher.find()) {
                System.out.print(matcher.start() + " ");
            }
            System.out.println("");
        }
    }
}
Output
Matcher ==> 0
5 7 16
Matcher ==> 1
6 14
Matcher ==> 2
0 1 2 3 4 5 7 8 9 10 11 12 13 15 16
Matcher ==> 3
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
Matcher ==> 4
0 1 2
Matcher ==> 5
0 1 2 3 4 8 13
Matcher ==> 6
0 1 2 3 4 8 10 13 15
Using Quantifiers
+ One or more occurrences
* Zero or more occurrences
? Zero or one occurrence
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Demonstrates the quantifiers {@code +}, {@code *} and {@code ?} by printing
 * the start index and matched text of every match, one expression at a time.
 */
public class RegexQuantifiers {

    /**
     * One row per demonstration: {@code {regex, source text}}. The row index is
     * the matcher number printed in the output.
     */
    private static final String[][] CASES = {
        // + : "0x" or "0X" followed by one or more hex digits
        {"0[xX]([0-9a-fA-F])+", "12 0x 0x12 0Xf 0xg"},
        // * : "proj1" followed by zero or more characters that are not a comma
        {"proj1([^,])*", "proj3.txt,proj1sched.pdf,proj1,proj2,proj1.java"},
        // ? : three digits, zero or one dash/whitespace, then four digits
        {"\\d\\d\\d([-\\s])?\\d\\d\\d\\d", "1234567,123-4567,123 4567"}
    };

    /**
     * For each row of {@link #CASES} prints a "Matcher ==> n" header, then one
     * "start text" line per match, then a blank line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Pairing each expression with its input in one table replaces the
        // index-based if/else chain, so a new case cannot pick up the wrong input.
        for (int i = 0; i < CASES.length; i++) {
            Matcher matcher = Pattern.compile(CASES[i][0]).matcher(CASES[i][1]);
            System.out.println("Matcher ==> " + i);
            while (matcher.find()) {
                System.out.println(matcher.start() + " " + matcher.group());
            }
            System.out.println("");
        }
    }
}
Greedy Quantifiers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Contrasts a greedy quantifier ({@code .*}) with its reluctant counterpart
 * ({@code .*?}) by running both against the same source text.
 */
public class RegexGreedy {

    /** Text searched by both expressions. */
    private static final String SOURCE = "yyxxxyxx";

    /**
     * Prints the matches of the greedy expression, a separator line, then the
     * matches of the reluctant expression.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Greedy *: takes as much as it can, so a single match spans the whole source.
        printMatches(".*xx");
        System.out.println("#########################");
        // Reluctant *?: takes as little as it can, so the source yields two shorter matches.
        printMatches(".*?xx");
    }

    /** Prints one "start text" line for every match of {@code regex} in {@link #SOURCE}. */
    private static void printMatches(String regex) {
        Matcher m = Pattern.compile(regex).matcher(SOURCE);
        while (m.find()) {
            System.out.println(m.start() + " " + m.group());
        }
    }
}
Tokenizing
Tokenizing is the process of taking big pieces of source data, breaking them into
little pieces, and storing the little pieces in variables
Tokenizing with Scanner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
import java.util.Scanner; | |
/**
 * Tokenizes the first command-line argument twice with {@link Scanner} and
 * prints a string recording what kind of token each pass saw.
 */
public class AppScanner {

    /**
     * First pass: appends "s" for every whitespace-delimited token. Second pass:
     * appends "i" for a token that reads as an int, "b" for one that reads as a
     * boolean, and "s2" for anything else. Finally prints "hits " followed by
     * the accumulated string, which starts with a single space.
     *
     * <p>For example, the argument {@code "1 true 34 hi"} prints
     * {@code hits  ssssibis2}.
     *
     * @param args {@code args[0]} is the text to tokenize; when it is missing a
     *             usage message is written to standard error and nothing else
     *             is printed
     */
    public static void main(String[] args) {
        // Guard the args[0] access: running with no argument used to fail with
        // an ArrayIndexOutOfBoundsException.
        if (args.length == 0) {
            System.err.println("usage: java AppScanner \"<text to tokenize>\"");
            return;
        }

        String hits = " ";
        Scanner plain = new Scanner(args[0]);
        Scanner typed = new Scanner(args[0]);

        // Pass 1: every token counts as a plain string.
        while (plain.hasNext()) {
            plain.next();
            hits += "s";
        }

        // Pass 2: classify each token; the value itself is consumed and discarded.
        while (typed.hasNext()) {
            if (typed.hasNextInt()) {
                typed.nextInt();
                hits += "i";
            } else if (typed.hasNextBoolean()) {
                typed.nextBoolean();
                hits += "b";
            } else {
                typed.next();
                hits += "s2";
            }
        }

        System.out.println("hits " + hits);
    }
}
Formatting with printf() and format()
Both methods have exactly the same behaviour which means anything we say about one of these methods is applicable to both.
Let's see how formatting works:
%[arg_index$][flags][width][.precision]conversion char
The values within [ ] are optional.
1. arg_index - An integer followed directly by a $, this indicates which argument should be printed in this position.
2. flags - While many flags are available, for the exam you'll need to know:
- "-" Left justify this argument
- "+" Include a sign (+ or -) with this argument
- "0" Pad this argument with zeroes
- "," Use locale-specific grouping separators (i.e., the comma in 123,456)
- "(" Enclose negative numbers in parentheses
3. width - This value indicates the minimum number of characters to print. (If you
want nice even columns, you'll use this value extensively.)
4. precision - For the exam you'll only need this when formatting a floating-point
number, and in the case of floating point numbers, precision indicates the number of
digits to print after the decimal point.
5. conversion - The type of argument you'll be formatting. You'll need to know:
- b boolean
- c char
- d integer
- f floating point
- s string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.ocjp.regex; | |
/**
 * Demonstrates {@code printf} format specifiers of the form
 * {@code %[arg_index$][flags][width][.precision]conversion}.
 */
public class AppPrintf {

    /**
     * Prints four formatted lines: a zero-padded integer, a floating-point
     * number with five decimals, both together, and a boolean.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        boolean b = true;
        double num = 565.99;
        long num1 = 6565415;

        // %n is used instead of a hard-coded "\n" so each line ends with the
        // platform's line separator.

        // "0" flag with width 10: pad with leading zeroes up to ten characters.
        System.out.printf("%1$010d %n", num1);
        // "," flag with precision 5: locale-specific grouping, five decimals.
        System.out.printf("%1$,.5f %n", num);
        // 1$ and 2$ pick the first and second argument explicitly.
        System.out.printf("%1$010d , %2$,.5f %n", num1, num);
        System.out.printf("---> %1$b%n", b);
    }
}