Posted on 11/30/2014 4:56:56 PM

[mw_shl_code=java,true]/**
 * @author Jack.Wang
 */
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// A simple search web crawler
public class SearchCrawler implements Runnable {
    /*
     * disallowListCache caches the URLs that robots disallow for each host.
     * The robots exclusion protocol places a robots.txt file in the root
     * directory of a website to specify which parts of the site crawlers
     * must not visit. The crawler should skip those areas. An example
     * robots.txt:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration   # disallow robots on registration page
     *   Disallow: /login
     */
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>();   // error messages
    ArrayList<String> result = new ArrayList<String>();      // search results
    String startUrl;                 // starting point of the search
    int maxUrl;                      // maximum number of URLs to process
    String searchString;             // string to search for
    boolean caseSensitive = false;   // whether matching is case-sensitive
    boolean limitHost = false;       // whether to restrict the search to the starting host
    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }
    public ArrayList<String> getResult() {
        return result;
    }
    public void run() {
        // Start the search
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }
    // Check the URL format; only HTTP URLs are processed.
    private URL verifyUrl(String url) {
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }
    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the URL
        System.out.println("host=" + host);

        // Get the cached list of paths this host disallows.
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If not cached yet, download robots.txt and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:".length());

                        // Strip any trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }

                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }

                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                // No robots.txt in the site root; assume everything is allowed.
                return true;
            }
        }

        String file = urlToCheck.getFile();
        System.out.println("File getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }

            return pageBuffer.toString();
        } catch (Exception e) {
        }

        return null;
    }
Remove "www" from the URL private String removeWwwFromUrl(String url) { int index = url.indexOf("://www."); if (index != -1) { return url.substring(0, index + 3) + url.substring(index + 7); }
return (url);
}
    // Parse the page and extract the links it contains.
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet crawledList, boolean limitHost) {
        // Compile the link-matching pattern as a regular expression.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) { continue; }
            // Skip anchors that point back into this page.
            if (link.charAt(0) == '#') { continue; }
            if (link.indexOf("mailto:") != -1) { continue; }
            if (link.toLowerCase().indexOf("javascript") != -1) { continue; }

            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') { // handles absolute paths
                    link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // handles relative addresses
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + path + link;
                    }
                }
            }

            // Strip any fragment identifier.
            int index = link.indexOf('#');
            if (index != -1) { link = link.substring(0, index); }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) { continue; }

            /* If the search is limited to one host, skip URLs on other hosts. */
            if (limitHost && !pageUrl.getHost().toLowerCase().equals(
                    verifiedLink.getHost().toLowerCase())) { continue; }

            // Skip links that have already been processed.
            if (crawledList.contains(link)) { continue; }

            linkList.add(link);
        }

        return linkList;
    }
    // Search the contents of a downloaded page for the given search string.
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // if case-insensitive, compare in lower case
            searchContents = pageContents.toLowerCase();
        }

        // Split the search string into individual terms; all must be present.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }

        return true;
    }
    // Perform the actual crawl and search.
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {

        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing search String");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Get the URL at the head of the to-crawl list.
            String url = toCrawlList.iterator().next();

            // Remove the URL from the to-crawl list.
            toCrawlList.remove(url);

            // Convert the string URL to a URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip the URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Add the URL to the set of processed URLs.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                // Extract valid links from the page.
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limitHost);

                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }
    // Main entry point.
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://www.itsvse.com/",
                100, "Delver_Si");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
[/mw_shl_code]
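For reference, the crawler can also be driven without spawning a thread. Below is a minimal usage sketch, assuming SearchCrawler is compiled into the same package in a separate file; the start URL, URL limit, and search term are placeholder values, not ones from the original post:

[mw_shl_code=java,true]// Usage sketch: run the crawler on the current thread and print the matches.
// The start URL, URL limit, and search term below are placeholders.
public class SearchCrawlerDemo {
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://example.com/", 50, "java");
        crawler.run(); // blocks until the URL limit is reached or no links remain
        for (String url : crawler.getResult()) {
            System.out.println("match: " + url);
        }
    }
}[/mw_shl_code]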