// AnalyzeHtml.java ... Analyze HTML elements of a URL via drilling down from a HTML elent type/class combination // RJM Programming 16/11/2013 import java.util.ArrayList; import java.util.Iterator; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import java.util.Scanner; import java.io.IOException; import java.net.URL; import java.util.List; public class AnalyzeHtml { TagNode ourRootNode; public AnalyzeHtml(URL ourHtmlPage) throws IOException { HtmlCleaner ourCleaner = new HtmlCleaner(); ourRootNode = ourCleaner.clean(ourHtmlPage); } List getTagsByClass(String passedClassname, String ourByName) { List ourTagList = new ArrayList(); TagNode ourTagElements[] = ourRootNode.getElementsByName(ourByName, true); for (int iit = 0; ourTagElements != null && iit < ourTagElements.length; iit++) { String ourClassType = ourTagElements[iit].getAttributeByName("class"); if (ourClassType != null && ourClassType.equals(passedClassname)) { ourTagList.add(ourTagElements[iit]); } } return ourTagList; } public static void main(String[] args) { String ourElementType = ""; //"select"; String ourClassName = ""; //"mytuts"; String ourUrl = ""; //"http://www.rjmprogramming.com.au"; try { ourElementType = args[0]; ourClassName = args[1]; ourUrl = args[2]; } catch (Exception err) { } Scanner in = new Scanner(System.in); if (ourElementType == "") { System.out.print("Please enter HTML element type to look for [div]: "); ourElementType = in.nextLine().replace('\n', '\0'); if (ourElementType.compareTo(" ") <= 0) ourElementType = "select"; } if (ourClassName == "") { System.out.print("Please enter HTML element class to look for [Normal]: "); ourClassName = in.nextLine().replace('\n', '\0'); if (ourClassName.compareTo(" ") <= 0) ourClassName = "mytuts"; } if (ourUrl == "") { System.out.print("Please enter HTML url to search on [http://www.rjmprogramming.com.au]: "); ourUrl = in.nextLine().replace('\n', '\0'); if (ourUrl.compareTo(" ") <= 0) ourUrl = "http://www.rjmprogramming.com.au"; } try { AnalyzeHtml thisAnalyze = new AnalyzeHtml(new URL(ourUrl)); List ourElements = thisAnalyze.getTagsByClass(ourClassName, ourElementType); System.out.println("Data of " + ourElementType + "s with class='" + ourClassName + "' at '" + ourUrl + "'"); for (Iterator ourIterator = ourElements.iterator(); ourIterator.hasNext();) { TagNode ourElement = (TagNode) ourIterator.next(); System.out.println("Text child nodes of " + ourElementType + "s: " + ourElement.getText().toString()); } } catch (Exception err) { err.printStackTrace(); } } }