Friday, 10 April 2015

Web Crawler Code in Java


Web Crawler Code in Java



import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Minimal breadth-first web crawler.
 *
 * <p>Starting from {@link #START_URL}, it downloads a page, prints every distinct
 * {@code http(s)://host} prefix found inside the page text, queues the hosts that have
 * not been queued before, and repeats until {@link #MAX_PAGES} pages have been attempted
 * or the queue runs dry. Only the scheme and host part of each URL is kept; paths and
 * query strings are dropped by the link pattern.
 */
public class dp {

    /** Seed URL the crawl starts from. */
    private static final String START_URL = "https://www.google.co.in";

    /** Upper bound on the number of pages requested in one run. */
    private static final int MAX_PAGES = 9;

    /**
     * Matches {@code http://} or {@code https://} followed by a dotted host name.
     * Host labels may contain hyphens, so {@code http://my-site.example.com} is
     * captured in full. Compiled once because it is applied to every token of every page.
     */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("https?://(?:[\\w-]+\\.)*[\\w-]+");

    /**
     * Runs the crawl and writes its progress to standard output.
     *
     * <p>Output format: the seed URL, a separator line, then for each attempted page the
     * distinct links found on it followed by another separator line. A page that cannot
     * be downloaded is reported on standard error and skipped, so one bad link does not
     * end the crawl.
     *
     * @param args ignored
     * @throws InterruptedException never thrown by this implementation; kept so the
     *     declared signature stays unchanged
     * @throws IOException never thrown by this implementation (per-page I/O failures are
     *     handled internally); kept so the declared signature stays unchanged
     */
    public static void main(String args[]) throws InterruptedException, IOException {
        // Every URL that has ever been put on the queue. Kept for the whole crawl so a
        // host is downloaded at most once, even when many pages link to it.
        Set<String> enqueued = new HashSet<String>();
        Queue<String> pending = new LinkedList<String>();

        pending.add(START_URL);
        enqueued.add(START_URL);
        System.out.println(START_URL);
        System.out.println("==================");

        for (int attempted = 0; attempted < MAX_PAGES && !pending.isEmpty(); attempted++) {
            String link = pending.remove();
            try {
                for (String found : extractLinks(fetchPage(link))) {
                    // Print every distinct link of this page, but only queue new ones.
                    System.out.println(found);
                    if (enqueued.add(found)) {
                        pending.add(found);
                    }
                }
            } catch (IOException ex) {
                // Unknown host, HTTP error, malformed URL, ...: skip this page only.
                System.err.println("Skipping " + link + ": " + ex);
            }
            System.out.println("================");
        }
    }

    /**
     * Downloads the document at {@code link} and returns its text.
     *
     * <p>The body is decoded as UTF-8 so the result does not depend on the platform
     * default charset. Line breaks are kept as {@code '\n'} so that text at the end of
     * one line cannot fuse with text at the start of the next.
     *
     * @param link absolute URL to download
     * @return the full response body
     * @throws IOException if the URL is malformed or the download fails
     */
    private static String fetchPage(String link) throws IOException {
        URL url = new URL(link);
        StringBuilder body = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append('\n');
            }
        }
        return body.toString();
    }

    /**
     * Extracts the distinct links contained in {@code html}, in order of first appearance.
     *
     * <p>The text is split on double quotes and at most one link (the first match of
     * the link pattern) is taken from each resulting segment.
     *
     * @param html page text to scan; {@code null} is treated as empty
     * @return the distinct {@code http(s)://host} strings found, in encounter order
     */
    static Set<String> extractLinks(String html) {
        Set<String> links = new LinkedHashSet<String>();
        if (html == null) {
            return links;
        }
        StringTokenizer segments = new StringTokenizer(html, "\"", false);
        while (segments.hasMoreTokens()) {
            Matcher matcher = LINK_PATTERN.matcher(segments.nextToken());
            if (matcher.find()) {
                links.add(matcher.group());
            }
        }
        return links;
    }
}

No comments:

Post a Comment