/*
 * @(#)CompareURLContent.java 0.0.1 97/12/10
 *
 * Copyright (c) 1997 Michael J. Radwin.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  * Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 *  * Neither the name of Radwin.org nor the names of its
 *    contributors may be used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/**
 * A self-serializing Java class that compares a hashed version of the
 * content stream associated with a URL with the current version.
 *
 * <p>The state is a table mapping the external form of each URL to the
 * {@link URLHash} recorded the last time that URL was fetched.  The whole
 * object is written to and read from disk with Java object serialization
 * (see {@link #save} and {@link #restore}).
 *
 * <p>No <code>serialVersionUID</code> is declared, so the serialized form
 * is tied to the default computed identifier; changing the non-private
 * members of this class may make previously saved state files unreadable
 * (in which case {@link #restore} returns null).
 *
 * @author Michael J. Radwin
 */
public class CompareURLContent implements java.io.Serializable {

    /** Maps URL external form (String) to the last seen URLHash. */
    protected java.util.Hashtable tab;

    /** Creates a comparer with no remembered URLs. */
    public CompareURLContent() {
        tab = new java.util.Hashtable();
    }

    /**
     * Convenience overload of {@link #getHashIfNewer(java.net.URL)} that
     * parses <code>u</code> first.
     *
     * @param u textual form of an http URL
     * @return null if the content is unchanged, otherwise the new hash
     * @throws java.net.MalformedURLException if <code>u</code> cannot be
     *         parsed as a URL
     * @throws java.io.IOException if the content cannot be fetched
     */
    public URLHash getHashIfNewer(String u)
        throws java.net.MalformedURLException, java.io.IOException {
        return getHashIfNewer(new java.net.URL(u));
    }

    /**
     * returns null if this URL is identical to the stored version, or a
     * URLHash object if it differs.  If it is new or not seen before,
     * it updates the stored state.
     *
     * @param u the URL to fetch; its connection must be an
     *          <code>HttpURLConnection</code>
     * @return null if the content digest equals the stored one, otherwise
     *         the freshly computed (and now stored) hash
     * @throws IllegalArgumentException if <code>u</code> does not open an
     *         <code>HttpURLConnection</code>
     * @throws java.io.IOException if the content cannot be read
     */
    public URLHash getHashIfNewer(java.net.URL u) throws java.io.IOException {
        java.net.URLConnection c = u.openConnection();
        if (!(c instanceof java.net.HttpURLConnection)) {
            throw new IllegalArgumentException("should be an http URL: "+u);
        }

        java.security.MessageDigest sha;
        try {
            sha = java.security.MessageDigest.getInstance("SHA");
        } catch (java.security.NoSuchAlgorithmException e) {
            throw new InternalError("SHA algorithm doesn't exist?!");
        }

        java.io.InputStream content = c.getInputStream();
        try {
            java.security.DigestInputStream dis =
                new java.security.DigestInputStream(content, sha);

            /* throw the bits on the floor until the content is finished;
             * reading through the stream is what feeds the digest */
            byte buf[] = new byte[1024];
            while (dis.read(buf) != -1) {
                /* nothing to do with the bytes themselves */
            }
        } finally {
            /* always release the connection's stream, even on a read error */
            closeQuietly(content);
        }

        /* key on the connection's URL rather than the one passed in */
        String url = c.getURL().toExternalForm();
        URLHash hash = new URLHash(sha.digest(), url, new java.util.Date());

        Object previous = tab.get(url);
        if (previous != null && !(previous instanceof URLHash)) {
            throw new InternalError("HashTable has wrong object type!");
        }
        if (hash.equals(previous)) {
            return null;
        }

        /* new URL, or content changed: remember the latest digest */
        tab.put(url, hash);
        return hash;
    }

    /**
     * compares every line in filename and returns a vector of strings
     * representing the URLs that are different.
     *
     * <p>Empty and whitespace-only lines are skipped.  The file is always
     * closed before this method returns, including when a URL check fails.
     *
     * @param filename text file holding one URL per line
     * @return the lines (as read) whose content is new or has changed
     * @throws java.io.IOException if the file cannot be read, or a line is
     *         not a valid URL, or its content cannot be fetched
     */
    public synchronized java.util.Vector compare(java.io.File filename)
        throws java.io.IOException {
        java.io.BufferedReader reader =
            new java.io.BufferedReader(new java.io.FileReader(filename));
        java.util.Vector changed = new java.util.Vector();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue;   /* blank line: nothing to compare */
                }
                debugln("comparing " + line);
                if (getHashIfNewer(line) != null) {
                    changed.addElement(line);
                }
            }
        } finally {
            closeQuietly(reader);
        }
        return changed;
    }

    /** When true, {@link #debug} and {@link #debugln} write to System.err. */
    protected static boolean debugging = false;

    /** Prints <code>s</code> to System.err if debugging is enabled. */
    protected static void debug(String s) {
        if (debugging) {
            System.err.print(s);
        }
    }

    /** Prints <code>s</code> and a newline to System.err if debugging is enabled. */
    protected static void debugln(String s) {
        if (debugging) {
            System.err.println(s);
        }
    }

    /**
     * Serializes this object to <code>filename</code>.
     *
     * @param filename destination file; overwritten if it exists
     * @return true if the object was written and flushed, false if the
     *         file could not be opened or written (details go to the
     *         debug log only)
     */
    public synchronized boolean save(java.io.File filename) {
        debugln(pname+": saving state...");

        java.io.FileOutputStream os;
        try {
            os = new java.io.FileOutputStream(filename);
        } catch (java.io.IOException e) {
            debugln(pname+": can't write to " + filename + ": " +
                    e.getMessage());
            return false;
        }

        try {
            java.io.ObjectOutputStream oos;
            try {
                oos = new java.io.ObjectOutputStream(os);
            } catch (java.io.IOException e) {
                debugln(pname+": can't write to " + filename + ": " +
                        e.getMessage());
                return false;
            }

            try {
                oos.writeObject(this);
                /* push any buffered bytes to the file before it is closed */
                oos.flush();
            } catch (java.io.IOException e) {
                debugln(pname+": error writing to "+filename);
                return false;
            }
        } finally {
            /* runs on every path above, so the file handle never leaks */
            closeQuietly(os);
        }

        debugln(pname+": done.");
        return true;
    }

    /**
     * Reads a previously {@link #save saved} object back from
     * <code>filename</code>.
     *
     * <p>This uses Java object deserialization, so the file should only
     * ever be one this program wrote itself.
     *
     * @param filename file to read
     * @return the restored object, or null if the file cannot be opened or
     *         does not contain a readable CompareURLContent
     */
    public static CompareURLContent restore(java.io.File filename) {
        debugln(pname+": restoring...");

        java.io.FileInputStream is;
        try {
            is = new java.io.FileInputStream(filename);
        } catch (java.io.IOException e) {
            debugln(pname+": can't read " + filename);
            return null;
        }

        try {
            java.io.ObjectInputStream ois;
            try {
                ois = new java.io.ObjectInputStream(is);
            } catch (java.io.IOException e) {
                debugln(pname+": can't read " + filename);
                return null;
            }

            try {
                CompareURLContent cuc = (CompareURLContent) ois.readObject();
                debugln(pname+": successfully restored.");
                return cuc;
            } catch (Exception e) {
                /* deliberately broad: any failure to decode the file
                 * (I/O, missing class, wrong type) means "no usable
                 * state", and the caller starts from scratch */
                debugln(pname+": invalid file " + filename);
                return null;
            }
        } finally {
            closeQuietly(is);
        }
    }

    /** Closes <code>in</code>, ignoring any error from close itself. */
    private static void closeQuietly(java.io.InputStream in) {
        try {
            in.close();
        } catch (java.io.IOException ignored) {
            /* nothing useful can be done about a failed close */
        }
    }

    /** Closes <code>out</code>, ignoring any error from close itself. */
    private static void closeQuietly(java.io.OutputStream out) {
        try {
            out.close();
        } catch (java.io.IOException ignored) {
            /* nothing useful can be done about a failed close */
        }
    }

    /** Closes <code>reader</code>, ignoring any error from close itself. */
    private static void closeQuietly(java.io.Reader reader) {
        try {
            reader.close();
        } catch (java.io.IOException ignored) {
            /* nothing useful can be done about a failed close */
        }
    }

    private static final String pname = "CompareURLContent";
    private static final String usage = "usage: "+pname+" urls.txt";
    private static final String sername = pname + ".ser";

    /**
     * Command-line entry point: restores state from the serialized file in
     * the current directory (or starts empty), prints each URL listed in
     * <code>args[0]</code> whose content is new or changed, then saves the
     * state.  If the comparison fails with an I/O error the state is not
     * saved.
     *
     * @param args <code>args[0]</code> is the file of URLs, one per line
     */
    public static void main(String args[]) {
        // ensure option flag
        if (args.length < 1) {
            System.err.println(usage);
            return;
        }

        CompareURLContent cuc = restore(new java.io.File(sername));
        if (cuc == null) {
            cuc = new CompareURLContent();
        }

        try {
            java.util.Vector v = cuc.compare(new java.io.File(args[0]));
            for (java.util.Enumeration e = v.elements();
                 e.hasMoreElements(); ) {
                System.out.println(e.nextElement());
            }
        } catch (java.io.IOException e) {
            debugln(pname+": error comparing with "+args[0]);
            debugln(e.toString());
            return;
        }

        cuc.save(new java.io.File(sername));
    }
}

/**
 * The digest of one URL's content, together with the URL it was fetched
 * from and the time it was computed.  Equality and hashing consider only
 * the digest bytes.
 */
class URLHash implements java.io.Serializable {
    /** The digest bytes. */
    public byte b[];
    /** External form of the URL the content came from. */
    public String url;
    /** When the digest was computed. */
    public java.util.Date d;

    /**
     * @param b   digest bytes (stored as given, not copied)
     * @param url external form of the URL
     * @param d   time the digest was computed
     */
    public URLHash(byte b[], String url, java.util.Date d) {
        this.b = b;
        this.url = url;
        this.d = d;
    }

    /**
     * Hash code derived from the digest bytes only.  Digests shorter than
     * 16 bytes use every byte; longer ones are sampled at a stride of
     * <code>length / 8</code>.
     */
    public int hashCode() {
        int h = 0;
        int len = b.length;

        if (len < 16) {
            for (int i = 0; i < len; i++) {
                h = (h * 37) + b[i];
            }
        } else {
            // only sample some characters
            int skip = len / 8;
            for (int off = 0; off < len; off += skip) {
                h = (h * 39) + b[off];
            }
        }

        return h;
    }

    /**
     * Two URLHash objects are equal when their digest bytes are identical;
     * the URL and date are not compared.
     */
    public boolean equals(Object other) {
        if (!(other instanceof URLHash)) {
            return false;   /* also covers null */
        }

        byte theirs[] = ((URLHash) other).b;
        if (b.length != theirs.length) {
            return false;
        }
        for (int i = 0; i < b.length; i++) {
            if (b[i] != theirs[i]) {
                return false;
            }
        }
        return true;
    }
}