HttpClient crawler example: scraping movie information and download addresses

This update mainly solves three problems: the download links on older pages may be in Thunder (thunder://) or FTP format; the detail links must be deduplicated, because every page also carries a recommendation list that contains detail links of its own; and pages that use a different layout must be handled as well. The updated methods are:

/**
 * Crawls one list page: collects its detail-page links, drops the hard-coded
 * exclusions, de-duplicates what is left and processes each remaining link.
 *
 * @param pa list page number, passed straight to {@code getPage(int)}
 */
public static void spider(int pa) {
        List<String> detailLinks = getPage(pa);
        // Hard-coded detail URLs that must not be processed by this run.
        List<String> excluded = Arrays.asList("http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", "));
        detailLinks.removeAll(excluded);
        output(detailLinks.size());
        // The same detail link can be matched several times on one page; a set keeps each once.
        Set<String> uniqueLinks = new HashSet<>();
        uniqueLinks.addAll(detailLinks);
        for (String link : uniqueLinks) {
            try {
                getMovieInfo(link);
                // Randomised pause between two detail-page requests.
                sleep(getRandomInt(3) + 3);
            } catch (Exception e) {
                // Best effort: report the link that failed and carry on with the next one.
                output(link);
            }
        }
    }
 
    /**
     * Processes every distinct detail-page link found in an already downloaded
     * piece of page text.
     *
     * @param text page content to scan for detail links via {@code getPage(String)}
     */
    public static void spider(String text) {
        List<String> detailLinks = getPage(text);
        // Collapse repeated matches of the same link.
        Set<String> uniqueLinks = new HashSet<>();
        uniqueLinks.addAll(detailLinks);
        for (String link : uniqueLinks) {
            try {
                getMovieInfo(link);
                // Randomised pause between two detail-page requests.
                sleep(getRandomInt(3));
            } catch (Exception e) {
                // Best effort: report the link that failed and carry on with the next one.
                output(link);
            }
        }
    }
 
    /**
     * Downloads one list page and returns every detail-page link found in it.
     *
     * @param page list page number; page 1 is served from the section root,
     *             every other page from {@code index_<page>.htm}
     * @return all matches of the detail-link pattern, in the order {@code regexAll} reports them
     */
    public static List<String> getPage(int page) {
        String url = page == 1
                ? "http://www.***.net/ys/"
                : "http://www.***.net/ys/index_" + page + ".htm";
        output(url);
        JSONObject response = getHttpResponse(getHttpGet(url));
        String content = response.getString("content");
        // The text is round-tripped through UTF-8 bytes before matching.
        String all = new String(content.getBytes(UTF_8), UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }
 
    /**
     * Extracts every detail-page link from page text that has already been downloaded.
     *
     * @param page raw page content
     * @return all matches of the detail-link pattern, in the order {@code regexAll} reports them
     */
    public static List<String> getPage(String page) {
        // The text is round-tripped through UTF-8 bytes before matching.
        String all = new String(page.getBytes(UTF_8), UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }
 
    /**
     * Builds the detail-page URL from its two numeric path segments and processes it.
     *
     * @param day   first numeric path segment of the detail URL (looks like a yyyyMMdd date in the sample links)
     * @param index second numeric path segment, i.e. the page id
     * @return whatever {@code getMovieInfo(String)} returns for the built URL, so a
     *         "content does not exist" page is reported as {@code false} instead of
     *         being masked by an unconditional {@code true}
     */
    public static boolean getMovieInfo(int day, int index) {
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        return getMovieInfo(url);
    }
 
    /**
     * Downloads one movie detail page, scrapes its metadata and download links, and
     * sends an INSERT for it through {@code MySqlTest.sendWork}.
     *
     * <p>Two page layouts are handled. Pages containing a "◎" marker are parsed from
     * the block that starts at the first marker and ends at the next {@code <hr};
     * all other pages are parsed from the title and the "label: value" lines.
     * Links for the ed2k column fall back from ed2k to ftp:// and then thunder://.
     * The INSERT is only sent when that column ended up non-empty.
     *
     * @param url address of the detail page
     * @return {@code false} when the page carries the "content does not exist" notice,
     *         {@code true} otherwise (also when nothing was inserted)
     */
    public static boolean getMovieInfo(String url) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("The content you queried does not exist. Please go back to the home page and search again.")) return false;
        byte[] bytes = s.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        String name = EMPTY, tname = EMPTY, year = EMPTY, language = EMPTY, date = EMPTY, score = EMPTY, length = EMPTY, author = EMPTY;
        int i = all.indexOf("◎");
        if (i >= 0) {
            // Look for the terminating <hr only after the marker, and slice the same
            // string the indices were computed on. A missing <hr, or one that sits
            // before the marker, would otherwise make substring throw.
            int i1 = all.indexOf("<hr", i);
            String info = i1 < 0 ? all.substring(i) : all.substring(i, i1);
            name = getInfo(info, "slice　　name　");
            tname = getInfo(info, "translate　　name　");
            year = getInfo(info, "year　　generation　");
            language = getInfo(info, "language　　word　");
            date = getInfo(info, "Release date　");
            score = getInfo(info, "Bean score　");
            length = getInfo(info, "slice　　long　");
            author = getInfo(info, "guide　　Play　");
        } else {
            name = getInfo(all, "<title>");
            // Keep only the part of the title before the first underscore.
            if (name.contains("_")) name = name.substring(0, name.indexOf("_"));
            length = getInfo(all, "Film length: ");
            date = getInfo(all, "Release date: ");
            author = getInfo(all, "director: ");
            language = getInfo(all, "language: ");
        }
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        if (ed2ks.size() == 0) ed2ks = regexAll(all, "ftp://.+?>");
        if (ed2ks.size() == 0) ed2ks = regexAll(all, "thunder://.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        // Every value comes from a scraped page, so it is escaped before being placed
        // inside the double-quoted SQL literals.
        sql = String.format(sql,
                escapeSql(name), escapeSql(tname), escapeSql(year), escapeSql(language),
                escapeSql(date), escapeSql(score), escapeSql(length), escapeSql(author),
                escapeSql(magnets.toString().replace("\"", EMPTY)),
                escapeSql(ed2ks.toString().replace("\"", EMPTY)),
                escapeSql(pans.toString().replace("\"", EMPTY)));
        if (ed2ks.size() != 0) MySqlTest.sendWork(sql);
        output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
        output(sql);
        return true;
    }

    /**
     * Escapes backslashes and double quotes so a scraped value cannot terminate the
     * double-quoted SQL string literal it is placed in.
     * NOTE(review): assumes the database uses backslash escapes (MySQL default mode) — confirm.
     */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }
 
    /**
     * Returns the text that follows a literal label, up to (not including) the next {@code <}.
     *
     * @param text  text to search
     * @param start literal label that precedes the wanted value
     * @return the value of the first match with the label and the trailing {@code <}
     *         removed, or {@code EMPTY} when the label is not followed by any value
     */
    public static String getInfo(String text, String start) {
        String value = EMPTY;
        // The label is a literal (it is stripped with String.replace below), so it is
        // quoted to keep regex metacharacters in it from changing the pattern.
        // NOTE(review): assumes regexAll compiles its pattern with java.util.regex — confirm.
        List<String> nameinfo = regexAll(text, java.util.regex.Pattern.quote(start) + ".+?<");
        if (nameinfo.size() > 0) value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
        return value;
    }

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

While working with an HttpClient-based crawler, I wanted to collect the download addresses from a movie website I follow. After some experimenting, I managed to crawl the information and download addresses (eD2k and magnet links) of several hundred popular movies. Along the way I also had to deal with the page encoding, pages whose layout did not match a single regular expression, and filtering out duplicate links. The code is attached for your reference.

Key information (such as the site address) has been masked. The approach, for your reference: first visit the list page and collect the links to the detail pages; then visit each detail page, extract the movie information and download addresses, and store them in the database.

/**
 * First version of the movie crawler: walks the list pages, visits every detail
 * page linked from them and stores the scraped metadata and download links through
 * {@code MySqlTest.sendWork}.
 */
public class MyTest extends ApiLibrary {
    /**
     * Entry point: crawls list pages 1 through 10, one page per iteration.
     */
    public static void main(String[] args) {
        DEFAULT_CHARSET = GB2312;
        // Pass the loop index on, so that every iteration crawls a different list page.
        for (int pageNo = 1; pageNo <= 10; pageNo++) {
            spider(pageNo);
        }

        testOver();
    }

    /**
     * Crawls one list page: collects its detail-page links, drops the hard-coded
     * exclusions, de-duplicates what is left and processes each remaining link.
     *
     * @param pa list page number, passed straight to {@link #getPage(int)}
     */
    public static void spider(int pa) {
        // The links have to be fetched first; everything below works on this list.
        List<String> page = getPage(pa);
        // Hard-coded detail URLs that must not be processed by this run.
        String[] abc = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
        page.removeAll(Arrays.asList(abc));
        // The same detail link can be matched several times on one page; a set keeps each once.
        Set<String> truelist = new HashSet<>();
        truelist.addAll(page);
        for (String p : truelist) {
            try {
                getMovieInfo(p);
                // Randomised pause between two detail-page requests.
                sleep(getRandomInt(3));
            } catch (Exception e) {
                // Best effort: report the link that failed and carry on with the next one.
                output(p);
            }
        }
    }

    /**
     * Downloads one list page, prints it, and returns every detail-page link found in it.
     *
     * @param page list page number; page 1 is served from the section root,
     *             every other page from {@code index_<page>.htm}
     * @return all matches of the detail-link pattern
     */
    public static List<String> getPage(int page) {
        String url = "http://www.***.net/ys/index_" + page + ".htm";
        if (page == 1) url = "http://www.***.net/ys/";
        output(url);
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String content = response.getString("content");
        output(content);
        byte[] bytes = content.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        List<String> list = regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
        return list;
    }

    /**
     * Scrapes and stores the detail page identified by its two numeric path segments.
     * Unlike {@link #getMovieInfo(String)} this variant prints nothing.
     *
     * @param day   first numeric path segment of the detail URL
     * @param index second numeric path segment, i.e. the page id
     * @return {@code false} when the page carries the "content does not exist" notice, {@code true} otherwise
     */
    public static boolean getMovieInfo(int day, int index) {
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        return scrapeAndStore(url, false);
    }

    /**
     * Scrapes and stores one detail page, then prints the lengths of the collected
     * link lists and the SQL statement that was sent.
     *
     * @param url address of the detail page
     * @return {@code false} when the page carries the "content does not exist" notice, {@code true} otherwise
     */
    public static boolean getMovieInfo(String url) {
        return scrapeAndStore(url, true);
    }

    /**
     * Shared implementation of both {@code getMovieInfo} overloads: downloads the page,
     * parses the block between the first "◎" marker and the following {@code <hr},
     * collects magnet / ed2k / Baidu pan links and sends the INSERT.
     *
     * @param url     address of the detail page
     * @param verbose whether to print the link-list lengths and the SQL after sending
     * @return {@code false} when the page carries the "content does not exist" notice, {@code true} otherwise
     * @throws IllegalStateException when the page has no "◎" marker
     */
    private static boolean scrapeAndStore(String url, boolean verbose) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("The content you queried does not exist. Please go back to the home page and search again.")) return false;
        byte[] bytes = s.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        int i = all.indexOf("◎");
        // A page without the marker cannot be parsed by this version; fail with a clear
        // message instead of an index error from substring.
        if (i < 0) throw new IllegalStateException("No info block (◎) found on " + url);
        // Look for the terminating <hr only after the marker, and slice the same string
        // the indices were computed on.
        int i1 = all.indexOf("<hr", i);
        String info = i1 < 0 ? all.substring(i) : all.substring(i, i1);
        String name = getInfo(info, "slice　　name　");
        String tname = getInfo(info, "translate　　name　");
        String year = getInfo(info, "year　　generation　");
        String language = getInfo(info, "language　　word　");
        String date = getInfo(info, "Release date　");
        String score = getInfo(info, "Bean score　");
        String length = getInfo(info, "slice　　long　");
        String author = getInfo(info, "guide　　Play　");
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        // Every value comes from a scraped page, so it is escaped before being placed
        // inside the double-quoted SQL literals.
        sql = String.format(sql,
                escapeSql(name), escapeSql(tname), escapeSql(year), escapeSql(language),
                escapeSql(date), escapeSql(score), escapeSql(length), escapeSql(author),
                escapeSql(magnets.toString().replace("\"", EMPTY)),
                escapeSql(ed2ks.toString().replace("\"", EMPTY)),
                escapeSql(pans.toString().replace("\"", EMPTY)));
        MySqlTest.sendWork(sql);
        if (verbose) {
            output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
            output(sql);
        }
        return true;
    }

    /**
     * Escapes backslashes and double quotes so a scraped value cannot terminate the
     * double-quoted SQL string literal it is placed in.
     * NOTE(review): assumes the database uses backslash escapes (MySQL default mode) — confirm.
     */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }

    /**
     * Returns the text that follows a literal label, up to (not including) the next {@code <}.
     *
     * @param text  text to search
     * @param start literal label that precedes the wanted value
     * @return the value of the first match with the label and the trailing {@code <}
     *         removed, or {@code EMPTY} when the label is not followed by any value
     */
    public static String getInfo(String text, String start) {
        String value = EMPTY;
        // The label is a literal (it is stripped with String.replace below), so it is
        // quoted to keep regex metacharacters in it from changing the pattern.
        // NOTE(review): assumes regexAll compiles its pattern with java.util.regex — confirm.
        List<String> nameinfo = regexAll(text, java.util.regex.Pattern.quote(start) + ".+?<");
        if (nameinfo.size() > 0) value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
        return value;
    }

}

The following is a screenshot of the data stored in the database:

Selection of Technical Articles

Selection of non-technical articles

Click on the Public Number Map

Keywords: Programming SQL Java ftp Linux

Added by BITRU on Mon, 16 Sep 2019 11:28:01 +0300

Programming VIP

httpclient crawler crawls movie information and downloads address instances

Selection of Technical Articles

Selection of non-technical articles

Click on the Public Number Map

Popular Keywords