/*
 * @(#)CompareURLContent.java 0.0.1 97/12/10
 *
 *  Copyright (c) 1997  Michael J. Radwin.
 *  All rights reserved.
 * 
 *  Redistribution and use in source and binary forms, with or
 *  without modification, are permitted provided that the following
 *  conditions are met:
 * 
 *   * Redistributions of source code must retain the above
 *     copyright notice, this list of conditions and the following
 *     disclaimer.
 * 
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following
 *     disclaimer in the documentation and/or other materials
 *     provided with the distribution.
 * 
 *   * Neither the name of Radwin.org nor the names of its
 *     contributors may be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 * 
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 *  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 *  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/**
 * A self-serializing Java class that compares a hashed version of the
 * content stream associated with a URL with the current version.
 *
 * The state is a table that maps the external form of each URL to the
 * URLHash recorded the last time that URL was fetched.  The whole
 * object is written with Java serialization by save() and read back
 * by restore().
 *
 * Maintenance note: this class declares no serialVersionUID, so the
 * default one is computed from its non-private members.  The helper
 * methods added for resource cleanup are deliberately private, and no
 * existing signature or modifier has been changed, so previously saved
 * state files remain readable.
 *
 * @author <A HREF="http://www.radwin.org/michael/">Michael J. Radwin</A>
 */
public class CompareURLContent implements java.io.Serializable
{
    /** maps URL external form (String) to the last URLHash stored for it */
    protected java.util.Hashtable tab;

    /** creates a comparer with no remembered URLs */
    public CompareURLContent()
    {
        tab = new java.util.Hashtable();
    }

    /**
     * convenience form of getHashIfNewer(java.net.URL) that first
     * parses the string into a URL.
     *
     * @param u the URL to fetch, in string form
     * @return null if the content is unchanged, otherwise the new hash
     * @exception java.net.MalformedURLException if u cannot be parsed
     * @exception java.io.IOException if the content cannot be read
     */
    public URLHash getHashIfNewer(String u)
        throws java.net.MalformedURLException, java.io.IOException
    {
        return getHashIfNewer(new java.net.URL(u));
    }

    /**
     * fetches the content of the URL, digests it, and compares the
     * digest with the one stored for that URL.
     *
     * Returns null if the digest equals the stored one.  Otherwise
     * (the URL has not been seen before, or its digest differs) the
     * new URLHash is stored in the table and returned.  The table key
     * is the external form of the URL reported by the connection after
     * the content has been read.
     *
     * @param u the URL to fetch
     * @return null if the content is unchanged, otherwise the new hash
     * @exception java.io.IOException if the content cannot be read
     * @exception IllegalArgumentException if the connection opened for
     *            u is not an HttpURLConnection
     */
    public URLHash getHashIfNewer(java.net.URL u)
        throws java.io.IOException
    {
        java.net.URLConnection c = u.openConnection();

        if (!(c instanceof java.net.HttpURLConnection)) {
            throw new IllegalArgumentException("should be an http URL: "+u);
        }

        java.security.MessageDigest sha;
        try {
            sha = java.security.MessageDigest.getInstance("SHA");
        } catch (java.security.NoSuchAlgorithmException e) {
            throw new InternalError("SHA algorithm doesn't exist?!");
        }

        java.security.DigestInputStream dis =
            new java.security.DigestInputStream(c.getInputStream(), sha);
        try {
            /* throw the bits on the floor until the content is finished;
               reading through the digest stream is what feeds the digest,
               and -1 is the end-of-stream marker */
            byte b[] = new byte[1024];
            while (dis.read(b) != -1)
                ;
        } finally {
            /* always release the connection's stream, even on a read error */
            closeQuietly(dis);
        }

        String url = c.getURL().toExternalForm();
        Object o = tab.get(url);

        URLHash hash = new URLHash(sha.digest(), url, new java.util.Date());

        if (o == null) {
            /* first time this URL is seen */
            tab.put(url, hash);
            return hash;
        } else if (o instanceof URLHash) {
            if (hash.equals(o)) {
                return null;
            }
            /* content changed: remember the new digest */
            tab.put(url, hash);
            return hash;
        } else {
            throw new InternalError("HashTable has wrong object type!");
        }
    }

    /**
     * reads the file line by line, treating each non-blank line as a
     * URL, and returns a vector of the lines (Strings) whose content
     * is new or differs from the stored version.  Lines that are empty
     * or contain only whitespace are skipped.
     *
     * The file is closed before this method returns, whether it
     * completes normally or a fetch fails.
     *
     * @param filename the file listing one URL per line
     * @return the lines for which getHashIfNewer returned non-null
     * @exception java.io.IOException if the file or any URL cannot be
     *            read, or a line is not a well-formed URL
     */
    public synchronized java.util.Vector compare(java.io.File filename)
        throws java.io.IOException
    {
        java.io.BufferedReader reader =
            new java.io.BufferedReader(new java.io.FileReader(filename));
        java.util.Vector v = new java.util.Vector();

        try {
            for (;;) {
                String s = reader.readLine();
                if (s == null)
                    break;
                /* a whitespace-only line is not a URL; skip it like an
                   empty one instead of aborting the whole run */
                if (s.trim().length() == 0)
                    continue;
                debugln("comparing " + s);
                if (getHashIfNewer(s) != null)
                    v.addElement(s);
            }
        } finally {
            closeQuietly(reader);
        }

        return v;
    }

    /** when true, debug() and debugln() write to System.err */
    protected static boolean debugging = false;

    /** prints s to System.err, without a newline, if debugging is on */
    protected static void debug(String s)
    {
        if (debugging)
            System.err.print(s);
    }

    /** prints s and a newline to System.err if debugging is on */
    protected static void debugln(String s)
    {
        if (debugging)
            System.err.println(s);
    }

    /**
     * serializes this object to the given file.
     *
     * The object stream is closed on success so that buffered data is
     * flushed to the file; a failure while flushing or closing is
     * reported as a failed save.
     *
     * @param filename the file to write
     * @return true if the state was written completely, false otherwise
     */
    public synchronized boolean save(java.io.File filename)
    {
        java.io.FileOutputStream os = null;
        java.io.ObjectOutputStream oos;

        debugln(pname+": saving state...");
        try {
            os = new java.io.FileOutputStream(filename);
            oos = new java.io.ObjectOutputStream(os);
        } catch (java.io.IOException e) {
            debugln(pname+": can't write to " +
                    filename + ": " + e.getMessage());
            /* the file may have been opened before the failure */
            closeQuietly(os);
            return false;
        }

        boolean written = false;
        try {
            oos.writeObject(this);
            /* close() flushes the object stream's buffer and closes the
               underlying file; without it the file can be truncated */
            oos.close();
            written = true;
        } catch (java.io.IOException e) {
            debugln(pname+": error writing to "+filename);
        } finally {
            if (!written)
                closeQuietly(os);
        }

        if (!written)
            return false;

        debugln(pname+": done.");
        return true;
    }

    /**
     * reads a previously saved CompareURLContent from the given file.
     *
     * NOTE: this uses Java native deserialization, so the file must
     * come from a trusted source (normally a file written by save()).
     *
     * @param filename the file to read
     * @return the restored object, or null if the file cannot be opened
     *         or does not contain a valid CompareURLContent
     */
    public static CompareURLContent restore(java.io.File filename)
    {
        java.io.FileInputStream is = null;
        java.io.ObjectInputStream ois;

        debugln(pname+": restoring...");
        try {
            is = new java.io.FileInputStream(filename);
            ois = new java.io.ObjectInputStream(is);
        } catch (java.io.IOException e) {
            debugln(pname+": can't read " + filename);
            /* the file may have been opened before the failure */
            closeQuietly(is);
            return null;
        }

        try {
            CompareURLContent cuc = (CompareURLContent) ois.readObject();
            debugln(pname+": successfully restored.");
            return cuc;
        } catch (Exception e) {
            /* deliberately broad: any failure here (I/O error, unknown
               class, wrong type) means the file is unusable, and the
               caller falls back to a fresh instance */
            debugln(pname+": invalid file " + filename);
            return null;
        } finally {
            closeQuietly(ois);
        }
    }

    /** closes in, ignoring null and any failure to close */
    private static void closeQuietly(java.io.InputStream in)
    {
        if (in == null)
            return;
        try {
            in.close();
        } catch (java.io.IOException ignored) {
            /* the data has already been consumed; nothing to recover */
        }
    }

    /** closes in, ignoring null and any failure to close */
    private static void closeQuietly(java.io.Reader in)
    {
        if (in == null)
            return;
        try {
            in.close();
        } catch (java.io.IOException ignored) {
            /* the data has already been consumed; nothing to recover */
        }
    }

    /** closes out on a failure path, ignoring null and any failure to close */
    private static void closeQuietly(java.io.OutputStream out)
    {
        if (out == null)
            return;
        try {
            out.close();
        } catch (java.io.IOException ignored) {
            /* only used after the write has already been reported failed */
        }
    }

    private static final String pname = "CompareURLContent";
    private static final String usage = "usage: "+pname+" urls.txt";
    private static final String sername = pname + ".ser";

    /**
     * command-line entry point: restores the saved state (or starts
     * fresh), compares every URL listed in the file named by the first
     * argument, prints the changed ones to System.out, and saves the
     * state again.  If the comparison fails, the error is reported on
     * System.err and the state is not saved.
     *
     * @param args args[0] is the file listing one URL per line
     */
    public static void main(String args[])
    {
        // the URL list file is a required argument
        if (args.length < 1) {
            System.err.println(usage);
            return;
        }

        CompareURLContent cuc = restore(new java.io.File(sername));

        if (cuc == null)
            cuc = new CompareURLContent();

        try {
            java.util.Vector v = cuc.compare(new java.io.File(args[0]));
            for (java.util.Enumeration e = v.elements();
                 e.hasMoreElements(); )
                System.out.println(e.nextElement());

        } catch (java.io.IOException e) {
            // report unconditionally: debug output is off by default,
            // which would make a failed run indistinguishable from
            // "nothing changed"
            System.err.println(pname+": error comparing with "+args[0]);
            System.err.println(e.toString());
            return;
        }

        cuc.save(new java.io.File(sername));
    }
}

/**
 * Serializable record pairing a byte array with the URL it belongs to
 * and a time stamp.  CompareURLContent fills the byte array with the
 * message digest of the URL's content.
 *
 * Equality and hashing look at the byte array only; the URL string and
 * the date play no part in either.
 */
class URLHash implements java.io.Serializable
{
    /** the hash bytes; the caller's array is kept as is, not copied */
    public byte b[];
    /** the URL these bytes were computed for */
    public String url;
    /** the time stamp handed to the constructor */
    public java.util.Date d;

    /**
     * stores the three values without copying or validating them.
     */
    public URLHash(byte b[], String url, java.util.Date d)
    {
        this.d = d;
        this.url = url;
        this.b = b;
    }

    /**
     * hash code derived from the byte array.  Arrays shorter than 16
     * bytes fold in every byte with multiplier 37; longer arrays fold
     * in only every (length / 8)-th byte, starting at index 0, with
     * multiplier 39.
     */
    public int hashCode()
    {
        final int total = b.length;
        int acc = 0;

        if (total >= 16) {
            // long array: sample at a fixed stride rather than every byte
            final int stride = total / 8;
            for (int pos = 0; pos < total; pos += stride) {
                acc = (acc * 39) + b[pos];
            }
            return acc;
        }

        // short array: every byte takes part
        for (int pos = 0; pos < total; pos++) {
            acc = (acc * 37) + b[pos];
        }
        return acc;
    }

    /**
     * true only if other is a URLHash whose byte array has the same
     * length and the same contents as this one's.
     */
    public boolean equals(Object other)
    {
        // instanceof is false for null, so this also rejects null
        if (!(other instanceof URLHash)) {
            return false;
        }

        byte theirs[] = ((URLHash) other).b;
        final int total = b.length;
        if (total != theirs.length) {
            return false;
        }

        int pos = 0;
        while (pos < total) {
            if (b[pos] != theirs[pos]) {
                return false;
            }
            pos++;
        }
        return true;
    }
}
