|
Re: listing file locations from web page
|
Posted: Apr 1, 2004 5:53 AM
|
|
I believe there are many ways to achieve what you want (there are many libraries that will parse an HTML document for you). Maybe you can use this as a starting point (although it seems strange to use Swing for this, it works): http://java.sun.com/developer/TechTips/1999/tt0923.html
I have changed the example to list img-tags' src attribute:
import java.io.*; import java.net.*; import javax.swing.text.*; import javax.swing.text.html.*;
/**
 * Prints the value of the {@code src} attribute of every element in an HTML
 * document that carries one (for example {@code <img src="...">}).
 *
 * <p>Usage: {@code java GetLinks [uri]}. When no argument is given, a built-in
 * default URL is used. The process exits with status 0 on success and 1 if
 * reading or parsing the document failed.
 */
class GetLinks {

    /** Document that is read when no URI is passed on the command line. */
    private static final String DEFAULT_URI = "http://www.domain.tld/test.html";

    /**
     * Parses the document named by {@code args[0]} (or the default URI) with
     * Swing's HTML editor kit and prints each {@code src} attribute value found,
     * one per line, to standard output.
     *
     * @param args optional single element: a URL or a local file name
     */
    public static void main(String[] args) {
        String uri = args.length > 0 ? args[0] : DEFAULT_URI;

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;
        Reader rd = null;
        try {
            // Create a reader on the HTML content.
            rd = getReader(uri);

            // Parse the HTML.
            kit.read(rd, doc, 0);

            // Iterate through the elements of the HTML document and print the
            // SRC attribute of each element that has one.
            ElementIterator it = new ElementIterator(doc);
            javax.swing.text.Element elem;
            while ((elem = it.next()) != null) {
                Object src = elem.getAttributes().getAttribute(HTML.Attribute.SRC);
                if (src != null) {
                    System.out.println(src);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            status = 1;
        } finally {
            // Release the underlying file or network stream.
            if (rd != null) {
                try {
                    rd.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails at shutdown.
                }
            }
        }

        // Explicit exit is kept from the original (presumably so that threads
        // started by Swing cannot keep the JVM alive -- TODO confirm); the
        // status now reflects whether the run succeeded.
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code "http:"} or {@code "https:"} it is treated as a URL; otherwise it
     * is assumed to be a local file name.
     *
     * <p>The bytes are decoded with the platform default charset. The caller is
     * responsible for closing the returned reader.
     *
     * @param uri a URL or a local file name
     * @return a reader positioned at the start of the content
     * @throws IOException if the URL cannot be opened or the file cannot be read
     */
    static Reader getReader(String uri) throws IOException {
        if (uri.startsWith("http:") || uri.startsWith("https:")) {
            // Retrieve from Internet.
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        } else {
            // Retrieve from file.
            return new FileReader(uri);
        }
    }
}
|
|