// Web Crawler Code in Java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal breadth-first web crawler.
 *
 * <p>Starting from {@link #START_URL}, it downloads a page, prints every distinct
 * {@code http(s)://host} prefix found in it, and queues the ones not queued before.
 * At most {@link #MAX_PAGES} URLs are attempted.
 */
public class dp {

    /** Seed URL the crawl starts from. */
    private static final String START_URL = "https://www.google.co.in";

    /** Upper bound on the number of URLs taken from the queue (successful or not). */
    private static final int MAX_PAGES = 9;

    /**
     * Matches the scheme and dotted word part of a URL. It stops at the first
     * character that is neither a word character nor a dot, so paths, ports and
     * hyphenated host names are cut off.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile("https?://(\\w+\\.)*(\\w+)");

    /**
     * Runs the crawl and prints the links of each fetched page to standard output,
     * followed by a separator line.
     *
     * <p>A URL that cannot be fetched is reported on standard error and skipped;
     * it does not stop the crawl.
     *
     * @param args ignored
     * @throws InterruptedException never thrown by this implementation; kept so the
     *         signature is unchanged
     * @throws IOException never thrown by this implementation (fetch failures are
     *         handled per URL); kept so the signature is unchanged
     */
    public static void main(String args[]) throws InterruptedException, IOException {
        // Every URL that has ever been queued; never cleared, so no URL is fetched twice.
        Set<String> queuedEver = new HashSet<>();
        Queue<String> pending = new LinkedList<>();

        queuedEver.add(START_URL);
        pending.add(START_URL);
        System.out.println(START_URL);
        System.out.println("==================");

        int attempts = 0;
        while (!pending.isEmpty() && attempts < MAX_PAGES) {
            String link = pending.remove();
            attempts++;

            String html;
            try {
                html = fetch(link);
            } catch (IOException ex) {
                // Covers malformed URLs, unknown hosts, HTTP errors and TLS failures.
                System.err.println("Skipping " + link + ": " + ex);
                continue;
            }

            // Links are printed once per page, but only queued if never queued before.
            for (String found : extractLinks(html)) {
                System.out.println(found);
                if (queuedEver.add(found)) {
                    pending.add(found);
                }
            }
            System.out.println("================");
        }
    }

    /**
     * Downloads the content behind {@code link} as text.
     *
     * @param link absolute URL to download
     * @return the content, decoded as UTF-8, with each line terminated by {@code '\n'}
     * @throws IOException if the URL is malformed or the content cannot be read
     */
    private static String fetch(String link) throws IOException {
        URL url = new URL(link);
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader and the underlying stream even on failure.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep a line break so a URL ending one line cannot merge with the next line.
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Finds every match of {@link #LINK_PATTERN} in {@code html}.
     *
     * <p>Package-private rather than private so it can be checked without network access.
     *
     * @param html text to scan; must not be {@code null}
     * @return the distinct matches, in order of first appearance; empty if there are none
     */
    static Set<String> extractLinks(String html) {
        Set<String> links = new LinkedHashSet<>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            links.add(matcher.group());
        }
        return links;
    }
}
// No comments:
// Post a Comment