Java Rumblings: parsing

Showing posts with label parsing. Show all posts

Wednesday, 22 June 2011

RSS Parser (SAX)

RSS (Really Simple Syndication)
RSS is a way to publish frequently changing content such as blog posts, news updates, stock quotes and the like. An RSS document, which is called a “feed,” “web feed,” or “channel,” contains either a summary of content from an associated web site or the full text. RSS formats are specified using XML, a generic specification for the creation of data formats.
I have attached a simple SAX parser for RSS. Please let me know if there is any flaw in the attached code. This code is provided for learning purposes, with less focus on coding standards and efficiency. You are free to use & modify it.

Code
RssParser.java

package com.vaani.rss.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Properties;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Minimal SAX-based RSS reader.
 *
 * <p>Usage: construct with the feed URL, call {@link #parse()}, then read the
 * result with {@link #getFeed()}. The parser collects the channel-level
 * fields (title, link, description, language, generator, copyright, image)
 * and, for every {@code <item>}, its title, link, description, pubDate and
 * categories.
 *
 * <p>Instances keep parsing state in fields, so one instance must not be
 * used from several threads at once.
 */
public class RssParser extends DefaultHandler
{
    /*
     * SAX features switched off before parsing. The feed comes from a remote
     * URL, so the document is untrusted: external entities and external DTDs
     * must not be resolved (XXE, unexpected outbound requests).
     */
    private static final String[] DISABLED_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private String        urlString;
    private RssFeed       rssFeed;   // created when <channel> starts
    private StringBuilder text;      // character data of the current element
    private Item          item;      // non-null while inside an <item>
    private boolean       imgStatus; // true while inside an <image>

    /**
     * @param url address of the RSS feed to read
     */
    public RssParser(String url)
    {
        this.urlString = url;
        this.text = new StringBuilder();
    }

    /**
     * Downloads and parses the feed.
     *
     * <p>Failures (malformed URL, I/O errors, parser configuration or XML
     * errors) are reported on standard output and not rethrown; in that case
     * {@link #getFeed()} returns whatever was collected before the failure,
     * possibly {@code null}.
     */
    public void parse()
    {
        InputStream urlInputStream = null;

        try
        {
            URL url = new URL(this.urlString);
            _setProxy(); // Set the proxy if needed
            urlInputStream = url.openConnection().getInputStream();

            // newInstance() never returns null, so no null check is needed.
            SAXParserFactory spf = SAXParserFactory.newInstance();
            hardenFactory(spf);
            SAXParser sp = spf.newSAXParser();
            sp.parse(urlInputStream, this);
        }

        /*
         * Exceptions that can end up here:
         * MalformedURLException
         * ParserConfigurationException
         * IOException
         * SAXException
         */

        catch (Exception e)
        {
            System.out.println("Exception: " + e);
            e.printStackTrace();
        }
        finally
        {
            if (urlInputStream != null)
            {
                try
                {
                    urlInputStream.close();
                }
                catch (IOException e)
                {
                    // Closing is best effort, but do not hide the failure.
                    System.out.println("Exception while closing the feed stream: " + e);
                }
            }
        }
    }

    /**
     * Turns off external entity / external DTD resolution on the factory.
     * A feature the underlying parser does not know is reported and skipped
     * so that parsing can still proceed.
     */
    private static void hardenFactory(SAXParserFactory factory)
    {
        for (String feature : DISABLED_FEATURES)
        {
            try
            {
                factory.setFeature(feature, false);
            }
            catch (Exception e)
            {
                System.out.println("XML parser does not support " + feature + ": " + e);
            }
        }
    }

    /**
     * @return the feed built by the last {@link #parse()} call, or
     *         {@code null} if no {@code <channel>} element has been seen
     */
    public RssFeed getFeed()
    {
        return (this.rssFeed);
    }

    /**
     * Tracks which container element (channel, item, image) is being entered.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes)
    {
        if (qName.equalsIgnoreCase("channel"))
            this.rssFeed = new RssFeed();
        else if (qName.equalsIgnoreCase("item") && (this.rssFeed != null))
        {
            this.item = new Item();
            this.rssFeed.addItem(this.item);
        }
        else if (qName.equalsIgnoreCase("image") && (this.rssFeed != null))
            this.imgStatus = true;
    }

    /**
     * Stores the text collected for the element that just ended in the
     * current item, the image or the channel, depending on the context.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
    {
        if (this.rssFeed == null)
            return;

        String value = this.text.toString().trim();

        if (qName.equalsIgnoreCase("item"))
            this.item = null;

        else if (qName.equalsIgnoreCase("image"))
            this.imgStatus = false;

        else if (qName.equalsIgnoreCase("title"))
        {
            if (this.item != null) this.item.title = value;
            else if (this.imgStatus) this.rssFeed.imageTitle = value;
            else this.rssFeed.title = value;
        }

        else if (qName.equalsIgnoreCase("link"))
        {
            if (this.item != null) this.item.link = value;
            else if (this.imgStatus) this.rssFeed.imageLink = value;
            else this.rssFeed.link = value;
        }

        else if (qName.equalsIgnoreCase("description"))
        {
            if (this.item != null) this.item.description = value;
            else this.rssFeed.description = value;
        }

        else if (qName.equalsIgnoreCase("url") && this.imgStatus)
            this.rssFeed.imageUrl = value;

        else if (qName.equalsIgnoreCase("language"))
            this.rssFeed.language = value;

        else if (qName.equalsIgnoreCase("generator"))
            this.rssFeed.generator = value;

        else if (qName.equalsIgnoreCase("copyright"))
            this.rssFeed.copyright = value;

        else if (qName.equalsIgnoreCase("pubDate") && (this.item != null))
            this.item.pubDate = value;

        else if (qName.equalsIgnoreCase("category") && (this.item != null))
            this.rssFeed.addItem(value, this.item);

        // Start collecting afresh for the next element.
        this.text.setLength(0);
    }

    /**
     * Accumulates character data; SAX may deliver one text node in several
     * chunks, hence the buffer.
     */
    @Override
    public void characters(char[] ch, int start, int length)
    {
        this.text.append(ch, start, length);
    }

    /**
     * Writes the proxy host and port into the JVM system properties.
     *
     * <p>NOTE: the values below are placeholders and must be replaced with a
     * real host and port; {@link #parse()} calls this method on every run.
     * If no proxy is needed, remove that call.
     */
    public static void _setProxy()
    throws IOException
    {
        Properties sysProperties = System.getProperties();
        sysProperties.put("proxyHost", "<Proxy IP Address>");
        sysProperties.put("proxyPort", "<Proxy Port Number>");
        System.setProperties(sysProperties);
    }

    /**
     * Plain data holder for one parsed channel. {@link #items} and
     * {@link #category} stay {@code null} until the first entry is added.
     */
    public static class RssFeed
    {
        public  String title;
        public  String description;
        public  String link;
        public  String language;
        public  String generator;
        public  String copyright;
        public  String imageUrl;
        public  String imageTitle;
        public  String imageLink;

        /** All items of the channel, in document order. */
        public ArrayList <Item> items;
        /** Items grouped by the text of their {@code <category>} elements. */
        public HashMap <String, ArrayList <Item>> category;

        /** Appends an item to {@link #items}, creating the list on first use. */
        public void addItem(Item item)
        {
            if (this.items == null)
                this.items = new ArrayList<Item>();
            this.items.add(item);
        }

        /** Files an item under the given category name in {@link #category}. */
        public void addItem(String category, Item item)
        {
            if (this.category == null)
                this.category = new HashMap<String, ArrayList<Item>>();
            ArrayList<Item> sameCategory = this.category.get(category);
            if (sameCategory == null)
            {
                sameCategory = new ArrayList<Item>();
                this.category.put(category, sameCategory);
            }
            sameCategory.add(item);
        }
    }

}

Item.java

package com.vaani.rss.parser;

/**
 * One {@code <item>} of an RSS channel. Fields are filled in by the parser
 * and stay {@code null} when the feed does not provide the element.
 */
public  class Item
{
    public  String title;
    public  String description;
    public  String link;
    public  String pubDate;

    /**
     * @return "title: pubDate" on the first line and the description on the
     *         second line
     */
    @Override
    public String toString()
    {
        // "\n", not the letter "n": the description goes on its own line.
        return (this.title + ": " +
             this.pubDate + "\n" + this.description);
    }
}

RssParserDemo.java - Ready with demo

package com.vaani.rss.main;

import java.util.ArrayList;

import com.vaani.rss.parser.RssParser;
import com.vaani.rss.parser.RssParser.RssFeed;
import com.vaani.rss.parser.Item;



/**
 * Command-line demo: parses one feed and prints its categories and item
 * titles.
 */
public class RssParserDemo {

    public static void main(String[] args){
        RssParser rp = new RssParser("<some rss feed>");
        rp.parse();
        RssFeed feed = rp.getFeed();

        // getFeed() returns null when parsing failed before a <channel>
        // element was seen (bad URL, network error, not an RSS document).
        if (feed == null)
        {
         System.out.println("No feed could be read.");
         return;
        }

        // Listing all categories & the no. of elements in each category
        if (feed.category != null)
        {
         System.out.println("Category List: ");
         for (String category : feed.category.keySet())
         {
          ArrayList<Item> inCategory = feed.category.get(category);
          System.out.println(category
            + ": "
            + inCategory.size());
         }
        }

        // Listing all items in the feed; the list is null for an empty feed
        if (feed.items != null)
        {
         for (Item item : feed.items)
          System.out.println(item.title);
        }
    }
}

Checking whether string is parseable to integer or double

This seems basic, right? In most cases it is, but as with almost everything in Java, this problem has its subtle pitfalls. This is mainly because Java does not provide a simple utility method that can answer this question. Today I want to share with you several ways of solving this problem and describe their good and bad sides.

Why should you care?

Checking for that is unnecessary in many cases. If the format of the data is defined and its contract states that the string is an integer, you can just parse it and deal with the unlikely exception if an error occurs. The problem arises when there is no such contract and you have to decide what to do next based on whether the string is an integer. In that case a plain try-catch check may be too expensive for you:

/**
 * Tells whether the given text can be parsed as a decimal {@code int}.
 * The check simply attempts the conversion and reports whether it failed.
 *
 * @param string text to test; {@code null} is reported as not an integer
 * @return {@code true} if parsing succeeds, {@code false} otherwise
 */
public boolean isInteger(String string) {
    boolean parseable;
    try {
        Integer.parseInt(string);
        parseable = true;
    } catch (NumberFormatException notAnInteger) {
        parseable = false;
    }
    return parseable;
}

This method’s execution cost is high because of two factors: the first is that to determine whether the string is an integer we have to do the whole parsing and throw away the result. The second is that we use exception throwing (which is expensive) to direct the program flow. The good thing about this code is its simplicity – you can say at a glance that the method is correct.

Let’s use RegExp!

Much faster is to create a regular expression and use it to check whether string contains an integer or double. The good thing about this approach is that the regexp can be precompiled and used several times after:

// Compiled once and reused by every call: optional minus sign, at least one
// digit, then optionally a dot followed by any number of digits.
private static final Pattern doublePattern = Pattern.compile("-?\\d+(\\.\\d*)?");

/**
 * Tells whether the whole string has the plain decimal form accepted by
 * {@code doublePattern}.
 *
 * @param string text to test; must not be {@code null}
 * @return {@code true} only if the entire string matches the pattern
 */
public boolean isDouble(String string) {
    final boolean wholeStringMatches = doublePattern.matcher(string).matches();
    return wholeStringMatches;
}

Unfortunately this method has important flaws: the pattern above will work for the most basic string representation of a Double, but what about more advanced ones like “1.23E-12”? Even if you improve this pattern (believe me, it’s difficult) there are still some checks that it will not be able to perform, for instance checking whether the integer is above Integer.MAX_VALUE.

What about Scanner?

There is a way of combining the two approaches shown above: first check with a regexp whether the string could possibly be an integer and, if it seems to be one, try to perform the actual parsing. If the regexp is ‘good enough’, the number of false positives resulting in a NumberFormatException will be acceptable. The good news is that this approach is already implemented by the Scanner class. See the following example:

/**
 * Walks over the whitespace-separated tokens of a sample sentence and reads
 * each one either as a number or as plain text, depending on what the
 * scanner recognises.
 */
public static void main(String[] args) {
    Scanner tokens = new Scanner("Test string: 12.3 dog 12345 cat 1.2E-3");

    for (; tokens.hasNext(); ) {
        if (!tokens.hasNextDouble()) {
            // not a number: consume the token as text
            String word = tokens.next();
            continue;
        }
        // the token is a number: consume it already parsed
        Double number = tokens.nextDouble();
    }
}

In essence, Scanner breaks the given string down into tokens around whitespace and allows you to iterate through them. It gives you useful access methods like ‘hasNextDouble()’ to check whether the next token is a Double or not, and lets you get it in parsed form as a Double with the ‘nextDouble()’ method.

Internals of Scanner show that it in fact combines both the regexp and exception catching methods, which makes it quite efficient. The downside is that the Scanner object itself is heavy and designed for parsing larger text strings, so it may be inefficient if you need to use it on simple strings like “123”.

Wait! It does not work for me!!

It is possible that you start using one of the methods above on real-life data and at some point things stop making sense… Why? Because we forgot about something important: numbers are locale-sensitive and their string representation varies from country to country. For instance, ten thousand is 10,000 in the US, 10 000 in Poland and 10.000 in Italy. Note that none of the methods above could successfully parse either Polish or Italian numbers! What can you do in those cases? You have to parse with a NumberFormat instance created for the specific locale:

// Number format for the Italian locale ('.' groups digits, ',' is the
// decimal separator). NOTE(review): NumberFormat instances are generally not
// synchronized; guard this shared instance if it is used from several
// threads.
private static final NumberFormat italianDouble =
        NumberFormat.getNumberInstance(Locale.ITALIAN);

/**
 * Tells whether the string starts with a number written in Italian format.
 *
 * <p>The check is lenient on purpose: parsing stops at the first character
 * that cannot belong to the number, so a string with trailing text after a
 * leading number is still accepted.
 *
 * @param string text to test; {@code null} is reported as not a number
 * @return {@code true} if a number could be parsed from the start of the
 *         string, {@code false} otherwise
 */
public boolean isItalianDouble(String string) {
    if (string == null) {
        return false;
    }
    try {
        return (italianDouble.parse(string) != null);
    } catch (java.text.ParseException notANumber) {
        // parse(String) signals "no number at the start" with this checked
        // exception rather than by returning null.
        return false;
    }
}

Now you can finally see that 10.000 is a valid number. Unfortunately with NumberFormat you get another set of problems – it is too liberal in parsing numbers! The method above will return true for 10.000 and false for both abc and x1, but it will also return true for 10abc, as it only looks for a parseable prefix of the string, not a total match.

Conclusion

As you can see, none of the solutions shown above is perfect – each of the methods above has its flaws and advantages. Because of that, the choice of which one is best for you strongly depends on the context of your program. The important factors are: how often you need to do a check like that, what the false result ratio is, whether you parse long human-readable text or just a few given values, and whether you care about locale-specific issues. It is also possible that in your code you’ll need a combination of them, or need to add some specific tweaks to one of them.

Tuesday, 21 June 2011

Java Ini Parser

There is an open source project named [ini4j] for processing Windows .ini configuration files. However, I found it overkill for my purposes. So here is my simple implementation of a .ini parser. It mimics the standard java.util.Properties class with enhancements to get and set properties by section name.

There are only a few simple rules:

Leading and trailing spaces are trimmed from section names, property names and property values.
Section names are enclosed between [ and ].
Properties following a section header belong to that section
Properties defined before the appearance of any section headers are considered global properties and should be set and get with no section names.
You can use either equal sign (=) or colon (:) to assign property values
Comments begin with either a semicolon (;), or a sharp sign (#) and extend to the end of line. It doesn't have to be the first character.
A backslash (\) escapes the next character (e.g., \# is a literal #, \\ is a literal \).
If the last character of a line is backslash (\), the value is continued on the next line with new line character included.

Example Code

import java.util.*;
import java.io.*;

/**
 * Small .ini reader/writer modelled on {@link java.util.Properties}, with
 * additional access to properties by section name.
 *
 * <p>Parsing rules implemented by {@link #load(InputStream)}:
 * <ul>
 * <li>Section names, property names and values are trimmed.</li>
 * <li>A section header is {@code [name]} at the start of a line; properties
 *     that follow belong to that section, properties before the first
 *     header are global.</li>
 * <li>The first {@code =} or {@code :} on a line separates name and value.</li>
 * <li>{@code ;} or {@code #} starts a comment that runs to the end of the
 *     line, wherever it appears.</li>
 * <li>A backslash takes the next character literally; a backslash at the
 *     end of a line continues the value on the next line, keeping the line
 *     break.</li>
 * </ul>
 */
public class IniProperties {
    private Properties globalProperties;
    private Map<String,Properties> properties;

    enum ParseState {
        NORMAL,
        ESCAPE,
        ESC_CRNL,
        COMMENT
    }

    public IniProperties() {
        globalProperties = new Properties();
        properties = new HashMap<String,Properties>();
    }

    /**
     * Load ini as properties from input stream.
     *
     * <p>Each byte is read as one ISO-8859-1 character, the same convention
     * as {@code Properties.load(InputStream)}. Properties are added to what
     * is already loaded; a section header that names an existing section
     * continues that section. The stream is not closed.
     *
     * @param in stream positioned at the start of the ini text
     * @throws IOException if reading from the stream fails
     */
    public void load(InputStream in) throws IOException {
        int bufSize = 4096;
        byte[] buffer = new byte[bufSize];
        int n = in.read(buffer, 0, bufSize);

        ParseState state = ParseState.NORMAL;
        boolean section_open = false;
        String current_section = null;
        String key = null;
        StringBuilder sb = new StringBuilder();
        while (n >= 0) {
            for (int i = 0; i < n; i++) {
                // Mask to 0..255: a plain cast would sign-extend bytes >= 0x80
                // into unrelated characters.
                char c = (char) (buffer[i] & 0xFF);

                if (state == ParseState.COMMENT) {
                    // comment, skip to end of line
                    if ((c == '\r') || (c == '\n')) {
                        state = ParseState.NORMAL;
                    }
                    else {
                        continue;
                    }
                }

                if (state == ParseState.ESCAPE) {
                    sb.append(c);
                    // if the EOL is \r\n, \ escapes both chars
                    state = (c == '\r') ? ParseState.ESC_CRNL : ParseState.NORMAL;
                    continue;
                }

                if (state == ParseState.ESC_CRNL) {
                    // Only the character right after an escaped \r can be the
                    // second half of an escaped line break.
                    state = ParseState.NORMAL;
                    if (c == '\n') {
                        sb.append(c);
                        continue;
                    }
                }

                switch (c) {
                    case '[': // start section, but only at the start of a line
                        if ((key == null) && (sb.toString().trim().length() == 0)) {
                            sb = new StringBuilder();
                            section_open = true;
                        }
                        else {
                            sb.append(c);
                        }
                        break;

                    case ']': // end section
                        if (section_open) {
                            current_section = sb.toString().trim();
                            sb = new StringBuilder();
                            // keep the properties of a section declared twice
                            if (!properties.containsKey(current_section)) {
                                properties.put(current_section, new Properties());
                            }
                            section_open = false;
                        }
                        else {
                            sb.append(c);
                        }
                        break;

                    case '\\': // escape char, take the next char as is
                        state = ParseState.ESCAPE;
                        break;

                    case '#':
                    case ';':
                        state = ParseState.COMMENT;
                        break;

                    case '=': // assignment operator
                    case ':':
                        if ((key == null) && !section_open) {
                            key = sb.toString().trim();
                            sb = new StringBuilder();
                        }
                        else {
                            // part of the value, or of a section name
                            sb.append(c);
                        }
                        break;

                    case '\r':
                    case '\n':
                        if (key != null) {
                            storeProperty(current_section, key, sb.toString().trim());
                        }
                        sb = new StringBuilder();
                        key = null;
                        section_open = false; // a header never spans lines
                        break;

                    default:
                        sb.append(c);
                }
            }
            n = in.read(buffer, 0, bufSize);
        }

        // The last line may end at end-of-file without a line break.
        if (key != null) {
            storeProperty(current_section, key, sb.toString().trim());
        }
    }

    // Stores a parsed property globally or in the given section.
    private void storeProperty(String section, String name, String value) {
        if (section == null) {
            this.setProperty(name, value);
        }
        else {
            this.setProperty(section, name, value);
        }
    }

    /**
     * Get global property by name.
     */
    public String getProperty(String name) {
        return globalProperties.getProperty(name);
    }

    /**
     * Set global property.
     */
    public void setProperty(String name, String value) {
        globalProperties.setProperty(name, value);
    }

    /**
     * Return iterator of global properties.
     */
    @SuppressWarnings("unchecked")
    public Iterator<String> properties() {
        return new IteratorFromEnumeration<String>(
                   (Enumeration<String>)globalProperties.propertyNames());
    }

    /**
     * Get property value for specified section and name. Returns null
     * if section or property does not exist.
     */
    public String getProperty(String section, String name) {
        Properties p = properties.get(section);
        return p == null ? null : p.getProperty(name);
    }

    /**
     * Set property value for specified section and name. Creates section
     * if not existing.
     */
    public void setProperty(String section, String name, String value) {
        Properties p = properties.get(section);
        if (p == null) {
            p = new Properties();
            properties.put(section, p);
        }
        p.setProperty(name, value);
    }

    /**
     * Return property iterator for specified section. Returns null if
     * specified section does not exist.
     */
    @SuppressWarnings("unchecked")
    public Iterator<String> properties(String section) {
        Properties p = properties.get(section);
        if (p == null) {
            return null;
        }
        return new IteratorFromEnumeration<String>(
                   (Enumeration<String>)p.propertyNames());
    }

    /**
     * Return iterator of names of section.
     */
    public Iterator<String> sections() {
        return properties.keySet().iterator();
    }

    /**
     * Dumps properties to output stream: global properties first, then each
     * section under its {@code [name]} header.
     */
    public void dump(PrintStream out) throws IOException {
        // Global properties
        Iterator<String> props = this.properties();
        while (props.hasNext()) {
            String name = props.next();
            out.printf("%s = %s\n", name, dumpEscape(getProperty(name)));
        }

        // sections
        Iterator<String> sections = this.sections();
        while (sections.hasNext()) {
            String section = sections.next();
            out.printf("\n[%s]\n", section);
            props = this.properties(section);
            while (props.hasNext()) {
                String name = props.next();
                out.printf("%s = %s\n", name, dumpEscape(getProperty(section, name)));
            }
        }
    }

    // Escapes backslashes, comment characters and line breaks so that the
    // dumped value reads back as the same text.
    private static String dumpEscape(String s) {
        return s.replaceAll("\\\\", "\\\\\\\\")
                .replaceAll(";", "\\\\;")
                .replaceAll("#", "\\\\#")
                .replaceAll("(\r?\n|\r)", "\\\\$1");
    }

    // private class used to coerce Enumeration to Iterator.
    private static class IteratorFromEnumeration<E> implements Iterator<E> {
        private Enumeration<E> e;

        public IteratorFromEnumeration(Enumeration<E> e) {
            this.e = e;
        }

        public boolean hasNext() {
            return e.hasMoreElements();
        }

        public E next() {
            return e.nextElement();
        }

        public void remove() {
            throw new UnsupportedOperationException(
                    "Can't change underlying enumeration");
        }
    }

    public static void main(String[] args) throws IOException {
        IniProperties props = new IniProperties();
        InputStream in = new BufferedInputStream(new FileInputStream("test.ini"));
        try {
            props.load(in);
        }
        finally {
            // release the file even if loading fails
            in.close();
        }

        props.dump(System.out);
    }
}

The test file
Following ini4j, here is a test file:

; Global properties
story = Snow White and the Seven Dwarfs
year = 1937
url = www.imdb.com/title/tt0029583/
; A backslash at the end of the line escapes the new line
; character, the property value continues to the next line.
; Since the ; character (after Dwarfs) starts a comment 
; (either at the beginning or middle of the line, we have 
; to escape it with a backslash.
plot = Snow White, pursued by a jealous queen, hides with the Dwarfs\; \
the queen feeds her a poison apple, but Prince Charming \
awakens her with a kiss. # this is a comment

# This is also a comment line
# The first : can also be used as assignment operator
Tagline: Walt Disney's New characters in his first full-length production!
file = C:\\local\\snowwhite.mpg

; Bashful
[bashful]
weight = 45.7
height = 98.8
age = 67
homePage = http://snowwhite.tale/~bashful

; Doc
[doc]
weight = 49.5
height = 87.7
age = 63
homePage = http://doc.dwarfs

Pages

Wednesday, 22 June 2011

RSS Parser (SAX)

Checking whether string is parseable to integer or double

Tuesday, 21 June 2011

Java Ini Parser