Normalization, sorting and searching


Normalization


Java Normalizer class


String equality


Collator equality

Collator normalization

Strength


String comparison


String comparison in German


String comparison in French


Ordering in Chinese


Collator comparison


import java.text.Collator;
import java.util.Locale;

/**
 * Demonstrates how the strength setting of a {@link Collator} changes the
 * outcome of string comparisons.
 *
 * <p>The same kinds of string pairs (differing in case and/or accent) are
 * compared three times: first with the collator's initial strength, then with
 * {@link Collator#SECONDARY}, and finally with {@link Collator#PRIMARY}.
 * The strings containing U+00C0 and U+00C1 start with "A with grave" and
 * "A with acute" respectively.
 */
public class Compare1 {
    /** Collator for the default locale; shared by every comparison below. */
    static Collator collate = Collator.getInstance();

    /**
     * Runs the comparisons and prints one line per pair to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Decompose accented characters canonically before comparing them.
        collate.setDecomposition(Collator.CANONICAL_DECOMPOSITION);

        System.out.println("Locale is " + Locale.getDefault().toString());

        System.out.println("Default strength is TERTIARY");
        compare("aaa", "bbb");
        compare("Abc", "abc");
        compare("Abc", "bbc");
        compare("\u00c0bc", "abc");

        collate.setStrength(Collator.SECONDARY);
        System.out.println("\nStrength is SECONDARY");
        compare("Abc", "abc");
        compare("\u00c0bc", "Abc");
        compare("\u00c0bc", "\u00c1bc");
        compare("Abc", "\u00c1bc");

        collate.setStrength(Collator.PRIMARY);
        System.out.println("\nStrength is PRIMARY");
        compare("Abc", "abc");
        compare("\u00c0bc", "abc");
    }

    /**
     * Compares two strings with the shared collator and prints whether the
     * first equals, precedes or follows the second.
     *
     * @param s1 left-hand string
     * @param s2 right-hand string
     */
    static void compare(String s1, String s2) {
        int comp = collate.compare(s1, s2);
        if (comp == 0) {
            print("equals", s1, s2);
        } else if (comp < 0) {
            print("is before", s1, s2);
        } else {
            print("is after", s1, s2);
        }
    }

    /**
     * Prints a line of the form {@code "s1" state "s2"}.
     *
     * @param state relation between the two strings, e.g. "is before"
     * @param s1 left-hand string
     * @param s2 right-hand string
     */
    static void print(String state, String s1, String s2) {
        // Both strings are quoted; the closing quote after s2 used to be missing.
        System.out.println("\"" + s1 + "\" " + state + " \"" + s2 + "\"");
    }
}


Making your own rules

You can define your own ordering rules for a RuleBasedCollator.


import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.Locale;

/**
 * Demonstrates a {@link RuleBasedCollator} built from a custom rule string.
 *
 * <p>The rule orders "c" before "b", so the second comparison in
 * {@link #main} reports the reverse of ordinary alphabetical order.
 */
public class Compare2 {
    /** Custom ordering: a, then c, then b. */
    static String rule = "< a < c < b";

    /** Collator built from {@link #rule}; assigned in {@link #main}. */
    static RuleBasedCollator collate;

    /**
     * Builds the collator from the rule string and prints two comparisons.
     *
     * @param args ignored
     * @throws java.text.ParseException if the rule string cannot be parsed
     */
    public static void main(String[] args) throws java.text.ParseException {
        collate = new RuleBasedCollator(rule);

        System.out.println("Locale is " + Locale.getDefault().toString());

        System.out.println("Default strength is TERTIARY");
        compare("aaa", "bbb");
        compare("bbb", "ccc");
    }

    /**
     * Compares two strings with the rule-based collator and prints whether
     * the first equals, precedes or follows the second.
     *
     * @param s1 left-hand string
     * @param s2 right-hand string
     */
    static void compare(String s1, String s2) {
        int comp = collate.compare(s1, s2);
        if (comp == 0) {
            print("equals", s1, s2);
        } else if (comp < 0) {
            print("is before", s1, s2);
        } else {
            print("is after", s1, s2);
        }
    }

    /**
     * Prints a line of the form {@code "s1" state "s2"}.
     *
     * @param state relation between the two strings, e.g. "is after"
     * @param s1 left-hand string
     * @param s2 right-hand string
     */
    static void print(String state, String s1, String s2) {
        // Both strings are quoted; the closing quote after s2 used to be missing.
        System.out.println("\"" + s1 + "\" " + state + " \"" + s2 + "\"");
    }
}

Sorting


import java.text.Collator;
import java.util.Locale;
import java.util.Vector;
import java.util.Collections;

/**
 * Demonstrates sorting a list of strings with a {@link Collator} used as the
 * comparator, instead of the strings' natural ordering.
 */
public class Sort {
    /** Collator for the default locale; passed to the sort as the comparator. */
    static Collator collate = Collator.getInstance();

    /**
     * Sorts three sample strings with the collator and prints them, one per
     * line, in sorted order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Decompose accented characters canonically before comparing them.
        collate.setDecomposition(Collator.CANONICAL_DECOMPOSITION);

        // Parameterized rather than raw, so the element type is checked.
        Vector<String> list = new Vector<>();
        list.add("abc");
        list.add("aaa");
        list.add("aab");

        Collections.sort(list, collate);

        for (String item : list) {
            System.out.println(item);
        }
    }
}


Text boundaries


BreakIterator class


import java.text.BreakIterator;

/**
 * Demonstrates splitting a string at word boundaries with
 * {@link BreakIterator}.
 */
public class WordBreak {

    /**
     * Prints the sample string, then every boundary reported by the word
     * iterator together with the text between it and the previous boundary.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String str = "An string, with!???! and others";
        System.out.println(str);
        BreakIterator iterator = BreakIterator.getWordInstance();
        iterator.setText(str);

        int start = iterator.first();
        int end = start;
        // Test for DONE before using 'end' as an index: DONE is not a valid
        // string position, and passing it to substring() throws
        // StringIndexOutOfBoundsException.
        while (end != BreakIterator.DONE) {
            System.out.print("Boundary at " + end);
            String word = str.substring(start, end);
            System.out.println(", word is \"" + word + "\"");
            start = end;
            end = iterator.next();
        }
    }

}


Word break in Chinese


Regular expressions



import java.util.regex.*;

/**
 * Demonstrates whole-string regular-expression matching with
 * {@link Pattern} and {@link Matcher}.
 */
public class Regex {

    /**
     * Matches the pattern {@code A*b} against two sample inputs, an ASCII
     * lower-case one and one starting with U+00C0 (A with grave), and prints
     * for each whether the entire input matched.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Pattern pattern = Pattern.compile("A*b");
        String[] inputs = {"aaaaab", "\u00c0b"};

        for (String input : inputs) {
            // matches() succeeds only if the whole input matches the pattern.
            boolean matched = pattern.matcher(input).matches();
            System.out.println(matched ? "Matched" : "Didn't match");
        }
    }

}



Jan Newmarch <jan@newmarch.name>
Last modified: Mon Aug 28 11:25:09 EST 2006
Copyright © Jan Newmarch, Monash University, 2007
Creative Commons License This work is licensed under a Creative Commons License
The moral right of Jan Newmarch to be identified as the author of this page has been asserted.