Posted on 11/30/2014 4:56:56 PM

[mw_shl_code=java,true]/**
 * @author Jack.Wang
 */
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// A simple search web crawler
public class SearchCrawler implements Runnable {
    /*
     * disallowListCache caches the URLs that robots disallow for each host.
     * The robots exclusion protocol places a robots.txt file in the root
     * directory of a website to specify which parts of the site crawlers
     * must not visit. The crawler should skip those areas. An example
     * robots.txt:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration   # disallow robots on registration page
     *   Disallow: /login
     */
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>();   // error messages
    ArrayList<String> result = new ArrayList<String>();      // search results
    String startUrl;                 // starting point of the search
    int maxUrl;                      // maximum number of URLs to process
    String searchString;             // string to search for
    boolean caseSensitive = false;   // whether matching is case-sensitive
    boolean limitHost = false;       // whether to restrict the search to the starting host
    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }
    public ArrayList<String> getResult() {
        return result;
    }
    public void run() {
        // Start the search
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }
    // Check the URL format; only HTTP URLs are processed.
    private URL verifyUrl(String url) {
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }
    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the URL
        System.out.println("host=" + host);

        // Get the cached list of paths this host disallows.
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If not cached yet, download robots.txt and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:".length());

                        // Strip any trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }

                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }

                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                // No robots.txt in the site root; assume everything is allowed.
                return true;
            }
        }

        String file = urlToCheck.getFile();
        System.out.println("File getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }

            return pageBuffer.toString();
        } catch (Exception e) {
        }

        return null;
    }
Remove "www" from the URL private String removeWwwFromUrl(String url) { int index = url.indexOf("://www."); if (index != -1) { return url.substring(0, index + 3) + url.substring(index + 7); }
return (url);
}
    // Parse the page and extract the links it contains.
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet crawledList, boolean limitHost) {
        // Compile the link-matching pattern as a regular expression.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) { continue; }
            // Skip anchors that point back into this page.
            if (link.charAt(0) == '#') { continue; }
            if (link.indexOf("mailto:") != -1) { continue; }
            if (link.toLowerCase().indexOf("javascript") != -1) { continue; }

            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') { // handles absolute paths
                    link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // handles relative addresses
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + path + link;
                    }
                }
            }

            // Strip any fragment identifier.
            int index = link.indexOf('#');
            if (index != -1) { link = link.substring(0, index); }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) { continue; }

            /* If the search is limited to one host, skip URLs on other hosts. */
            if (limitHost && !pageUrl.getHost().toLowerCase().equals(
                    verifiedLink.getHost().toLowerCase())) { continue; }

            // Skip links that have already been processed.
            if (crawledList.contains(link)) { continue; }

            linkList.add(link);
        }

        return linkList;
    }
    // Search the contents of a downloaded page for the given search string.
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // if case-insensitive, compare in lower case
            searchContents = pageContents.toLowerCase();
        }

        // Split the search string into individual terms; all must be present.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }

        return true;
    }
    // Perform the actual crawl and search.
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {

        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing search String");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Get the URL at the head of the to-crawl list.
            String url = toCrawlList.iterator().next();

            // Remove the URL from the to-crawl list.
            toCrawlList.remove(url);

            // Convert the string URL to a URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip the URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Add the URL to the set of processed URLs.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                // Extract valid links from the page.
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limitHost);

                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }
    // Main entry point.
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://www.itsvse.com/",
                100, "Delver_Si");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
[/mw_shl_code]
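For reference, the crawler can also be driven without spawning a thread. Below is a minimal usage sketch, assuming SearchCrawler is compiled into the same package in a separate file; the start URL, URL limit, and search term are placeholder values, not ones from the original post:

[mw_shl_code=java,true]// Usage sketch: run the crawler on the current thread and print the matches.
// The start URL, URL limit, and search term below are placeholders.
public class SearchCrawlerDemo {
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://example.com/", 50, "java");
        crawler.run(); // blocks until the URL limit is reached or no links remain
        for (String url : crawler.getResult()) {
            System.out.println("match: " + url);
        }
    }
}[/mw_shl_code]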