// AnalyzeHtml.java ... Analyze HTML elements of a URL by drilling down from an HTML element type/class combination
// RJM Programming 16/11/2013
import java.util.ArrayList;
import java.util.Iterator;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.util.Scanner;
import java.io.IOException;
import java.net.URL;
import java.util.List;
/**
 * Command-line utility that fetches an HTML page, parses it with HtmlCleaner and prints the
 * text of every element that has a given tag name and a given CSS class.
 *
 * <p>Usage: {@code java AnalyzeHtml [elementType [className [url]]]}. Any argument that is
 * missing or blank is asked for on standard input; a blank answer selects the default shown
 * in square brackets in the prompt.
 */
public class AnalyzeHtml
{
    /** Tag name used when the user supplies none. */
    private static final String DEFAULT_ELEMENT_TYPE = "select";
    /** Class name used when the user supplies none. */
    private static final String DEFAULT_CLASS_NAME = "mytuts";
    /** Page searched when the user supplies no URL. */
    private static final String DEFAULT_URL = "http://www.rjmprogramming.com.au";

    /** Root of the cleaned document tree produced by HtmlCleaner in the constructor. */
    TagNode ourRootNode;

    /**
     * Parses the page at the given URL into a tag tree.
     *
     * @param ourHtmlPage location of the HTML document to analyze
     * @throws IOException if HtmlCleaner fails to read the page
     */
    public AnalyzeHtml(URL ourHtmlPage) throws IOException {
        HtmlCleaner ourCleaner = new HtmlCleaner();
        ourRootNode = ourCleaner.clean(ourHtmlPage);
    }

    /**
     * Collects the elements named {@code ourByName} whose {@code class} attribute contains
     * {@code passedClassname}.
     *
     * <p>An element matches when its whole {@code class} attribute equals the requested name
     * or when any whitespace-separated token of the attribute equals it, so
     * {@code class="menu mytuts"} matches {@code mytuts}.
     *
     * @param passedClassname class name to look for; {@code null} matches nothing
     * @param ourByName tag name handed to {@code TagNode.getElementsByName} (recursive search)
     * @return the matching elements in the order HtmlCleaner returned them; never {@code null}
     */
    List<TagNode> getTagsByClass(String passedClassname, String ourByName) {
        List<TagNode> ourTagList = new ArrayList<TagNode>();
        TagNode[] ourTagElements = ourRootNode.getElementsByName(ourByName, true);
        if (ourTagElements == null) {
            return ourTagList;
        }
        for (TagNode candidate : ourTagElements) {
            if (hasClass(candidate.getAttributeByName("class"), passedClassname)) {
                ourTagList.add(candidate);
            }
        }
        return ourTagList;
    }

    /** Tells whether a {@code class} attribute value names the wanted class. */
    private static boolean hasClass(String classAttribute, String wantedClass) {
        if (classAttribute == null || wantedClass == null) {
            return false;
        }
        // Exact match is kept so a multi-word request such as "a b" still behaves as before.
        if (classAttribute.equals(wantedClass)) {
            return true;
        }
        String[] tokens = classAttribute.trim().split("\\s+");
        for (String token : tokens) {
            if (token.length() > 0 && token.equals(wantedClass)) {
                return true;
            }
        }
        return false;
    }

    /** Returns the argument at {@code index}, or an empty string when it was not supplied. */
    private static String argumentOrEmpty(String[] args, int index) {
        return (args != null && index < args.length && args[index] != null) ? args[index] : "";
    }

    /**
     * Returns {@code value} when it is non-blank; otherwise prompts on standard output, reads
     * one line from {@code in} and falls back to {@code defaultValue} on a blank answer or
     * when no more input is available.
     */
    private static String valueOrPrompt(String value, Scanner in, String prompt, String defaultValue) {
        if (value.trim().length() > 0) {
            return value;
        }
        System.out.print(prompt + " [" + defaultValue + "]: ");
        // nextLine() already strips the line terminator; hasNextLine() guards a closed stdin.
        String answer = in.hasNextLine() ? in.nextLine().trim() : "";
        return answer.length() == 0 ? defaultValue : answer;
    }

    /**
     * Entry point: resolves element type, class name and URL from the arguments or the
     * console, then prints the text of every matching element.
     *
     * @param args optional {@code elementType}, {@code className} and {@code url}, in that order
     */
    public static void main(String[] args) {
        // Deliberately not closed: closing the Scanner would close System.in as well.
        Scanner in = new Scanner(System.in);

        String ourElementType = valueOrPrompt(argumentOrEmpty(args, 0), in,
                "Please enter HTML element type to look for", DEFAULT_ELEMENT_TYPE);
        String ourClassName = valueOrPrompt(argumentOrEmpty(args, 1), in,
                "Please enter HTML element class to look for", DEFAULT_CLASS_NAME);
        String ourUrl = valueOrPrompt(argumentOrEmpty(args, 2), in,
                "Please enter HTML url to search on", DEFAULT_URL);

        try {
            AnalyzeHtml thisAnalyze = new AnalyzeHtml(new URL(ourUrl));
            List<TagNode> ourElements = thisAnalyze.getTagsByClass(ourClassName, ourElementType);
            System.out.println("Data of " + ourElementType + "s with class='" + ourClassName + "' at '" + ourUrl + "'");
            for (TagNode ourElement : ourElements) {
                System.out.println("Text child nodes of " + ourElementType + "s: " + ourElement.getText().toString());
            }
        } catch (Exception err) {
            // Top-level boundary: report which page failed, then the full trace for diagnosis.
            System.err.println("Unable to analyze '" + ourUrl + "': " + err);
            err.printStackTrace();
        }
    }
}