新浪微博 登陆  注册   设为首页 加入收藏

学PHP >> PHP >> 网络爬虫单线程的实现

网络爬虫单线程的实现

查看次数5036 发表时间2013-06-08 04:41:40

参考参考链接多线程具体没看懂先实现单线程爬取百度的数百个链接菜鸟自娱,不喜勿喷1.建立连接并爬取本URL的所有链接 (筛选链接的方法具体看下面)  @1:  importjava.io.BufferedReader;...


参考参考链接
多线程的具体实现还没看懂,先实现单线程版本,爬取百度的数百个链接(大笑)。菜鸟自娱,不喜勿喷。
1.建立连接并爬取本URL的所有链接 (筛选链接的方法具体看下面)
  @1:  import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;

public class Gather {
    URLurl;

    publicGather(URL url) {
       this.url =url;
    }

public static ArrayList SearchURL(URL url){
    StringBufferdocument = new StringBuffer("");
 
    try {

      URLConnection conn = url.openConnection();

      BufferedReader reader = new BufferedReader(newInputStreamReader(
            conn.getInputStream()));
       String line= null;

       while ((line= reader.readLine()) != null) {
          byte[] b =line.getBytes("ISO-8859-1");
          line = newString(b);
         
         document.append(line + " ");
       }
    } catch(Exception e) {

      e.printStackTrace();
    }

    ArrayListlist = new ArrayList();
    list = newUrlDetector().urlDetector(new String(document));
    returnlist;


}
}
@2:
//初次匹配到的url是形如: <a href="http://www.baidu.com" target="_blank">
      //为此,需要进行下一步的处理,把真正的url抽取出来,
      //可以对于前两个"之间的部分进行记录得到url
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Extracts absolute {@code http://} links from the anchor tags of an HTML
 * document.
 */
public class UrlDetector {

    /**
     * Matches an opening anchor tag whose first attribute is {@code href},
     * from {@code <a} up to and including the closing {@code >}.
     * Compiled once because the expression never changes.
     */
    private static final Pattern ANCHOR_TAG =
            Pattern.compile("<a\\s+href=([^>]*\\s*>)", Pattern.CASE_INSENSITIVE);

    /**
     * Scans {@code htmlDoc} for anchor tags and returns the link targets.
     *
     * <p>For every matched tag, the text between the first two double quotes
     * is taken as the candidate address. Candidates that are unquoted, do
     * not contain {@code http://}, or cannot be parsed as a URL are skipped;
     * a parse failure is printed to standard error.
     *
     * @param htmlDoc HTML text to scan; {@code null} is treated as empty
     * @return the extracted URLs in document order, never {@code null}
     */
    public ArrayList<URL> urlDetector(String htmlDoc) {
        ArrayList<URL> allURLs = new ArrayList<URL>();
        if (htmlDoc == null) {
            return allURLs;
        }

        Matcher matcher = ANCHOR_TAG.matcher(htmlDoc);
        while (matcher.find()) {
            String tag = matcher.group();

            // The address is whatever sits between the first pair of quotes.
            int open = tag.indexOf('"');
            if (open < 0) {
                continue;
            }
            int close = tag.indexOf('"', open + 1);
            if (close < 0) {
                continue;
            }

            String candidate = tag.substring(open + 1, close);
            if (!candidate.contains("http://")) {
                continue;
            }

            try {
                allURLs.add(new URL(candidate));
            } catch (MalformedURLException e) {
                // A single bad link should not stop the scan.
                e.printStackTrace();
            }
        }

        return allURLs;
    }
}
@3:
实现简单的URL分配
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;


/**
 * Singleton that owns the shared pool of URLs for the crawl.
 *
 * <p>The first call to {@link #getInstance()} seeds the pool with one start
 * address.
 */
public class Dispatcher {
    /** Start address added to the pool when the singleton is created. */
    private static final String SEED_URL = "http://www.baidu.com";

    /** Shared URL pool; replaced wholesale by {@link #setUrlpool(ArrayList)}. */
    private static ArrayList<URL> urlpool = new ArrayList<URL>();

    /** The single instance, created on first use by {@link #getInstance()}. */
    private static Dispatcher dispatcher;

    /**
     * Seeds the current pool with the start address. A malformed seed is
     * printed to standard error and leaves the pool unchanged.
     */
    private Dispatcher() {
        try {
            urlpool.add(new URL(SEED_URL));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        // The instance is published by getInstance() only, so no reference
        // escapes from a constructor that is still running.
    }

    /**
     * Replaces the shared pool.
     *
     * @param urlpool list that becomes the new pool (stored, not copied)
     */
    public static void setUrlpool(ArrayList<URL> urlpool) {
        Dispatcher.urlpool = urlpool;
    }

    /**
     * Returns the shared pool itself, not a copy; changes made by the caller
     * are visible to everyone else.
     *
     * @return the current pool
     */
    public static ArrayList<URL> getUrlpool() {
        return urlpool;
    }

    /**
     * Returns the singleton, creating and seeding it on the first call.
     *
     * @return the single {@code Dispatcher} instance
     */
    public static synchronized Dispatcher getInstance() {
        if (dispatcher == null) {
            dispatcher = new Dispatcher();
        }
        return dispatcher;
    }
}
@4:
主函数
import java.net.URL;
import java.util.ArrayList;

public class Splier {

    staticArrayList runlist;

    publicstatic void main(String[] args) {

       Dispatcherdisp = Dispatcher.getInstance();
       runlist =disp.getUrlpool();
       for (int i =0; i < runlist.size(); i++)

       {
//         System.out.println("start  " +runlist.size());
//         System.out.println("Spli");
          //GathersGather =new Gather(runlist.get(i));
         runlist.addAll(Gather.SearchURL(runlist.get(i)));
          //Threadgather = new Thread(new Gather(runlist.get(i)));

         //gather.start();
         if(runlist.size()>100)
            break;
    
(转发请注明转自:
学PHP)    


  相关推荐




  发表评论
昵称:
(不超过20个字符或10个汉字)
内容: