import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.CookieHandler; import java.net.CookieManager; import java.net.CookiePolicy; import java.net.HttpURLConnection; import java.net.URL; import java.nio.charset.Charset; import java.util.List;
/**
 * Obtains administrative division data from the National Bureau of Statistics.
 * NBOS = National Bureau of Statistics.
 */
/** * Read Provincial Information * @param args * @throws Exception */ public static void main(String[] args) throws Exception { String url = baseUrl + "index.html"; //If you need to set up a proxy //initProxy("xx.xx.xx.xx", "xx"); String str = getContent(url).toUpperCase(); String[] arrs = str.split("<A"); for (String s : arrs) { if (s.indexOf("HREF") != -1 && s.indexOf(".HTML") != -1) { String a = s.substring(7, s.indexOf("'>")); String provinceCode = a.substring(0, 2) + "0000000000"; System.out.println("provincial level CODE:" + provinceCode); String name = s.substring(s.indexOf("'>")+2, s.indexOf("<BR/>")); if(!"Beijing City".equals(name)){ continue; } System.out.println("Acquiring provinces:"+name); readShi(a,name); } } }
/** * Read city data * @param list * @throws Exception */ public static void readShi(String url,String name) throws Exception{ String content = getContent(baseUrl+url).toUpperCase(); String[] citys = content.split("CITYTR"); //'> td > < a href ='11 / 1101. HTML' > 110100000000 < / a > < td > < a href ='11 / 1101. HTML '> municipal district < / a > < td > < tr class for(int c=1,len=citys.length; c<len; c++){ String[] strs = citys[c].split("<A HREF='"); String cityUrl = null; String cityName = null; for(int si = 1; si<3; si++){ if(si==1){//Link and code cityUrl = strs[si].substring(0, strs[si].indexOf("'>")); String cityCode = strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println("cityCode:" + cityCode); }else{ cityName = name+strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println("Get city:" + cityName); } } readXian(cityUrl.substring(0, cityUrl.indexOf("/")+1),cityUrl,cityName); } } /** * Read county data * @param url * @throws Exception */ public static void readXian(String prix,String url,String cityName) throws Exception{ String content = getContent(baseUrl+url).toUpperCase(); String[] citys = content.split("COUNTYTR"); for(int i=1; i<citys.length; i++){ String cityUrl = null; String areaName = null; if(citys[i].indexOf("<A HREF='")==-1){ String cityCode = citys[i].substring(6, 18); System.out.println("AreaCode:"+cityCode); areaName = cityName + citys[i].substring(citys[i].indexOf("</TD><TD>")+9,citys[i].lastIndexOf("</TD>")); System.out.println("Get city area:" + areaName); }else{ String[] strs = citys[i].split("<A HREF='"); for(int si = 1; si<3; si++){ if(si==1){//Link and code cityUrl = strs[si].substring(0, strs[si].indexOf("'>")); String cityCode = strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println("AreaCode:"+cityCode); }else{ areaName = cityName+strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println("Get the 
city(Or county city):" + areaName); } } } if(null!=cityUrl){ readZhen(prix,cityUrl,areaName); } } } /** * Read town's data * @param url * @throws Exception */ public static void readZhen(String prix,String url,String areaName) throws Exception{ String content = getContent(baseUrl+prix+url).toUpperCase(); String myPrix = (prix+url).substring(0, (prix+url).lastIndexOf("/")+1); String[] citys = content.split("TOWNTR"); for(int i=1; i<citys.length; i++){ String[] strs = citys[i].split("<A HREF='"); String cityUrl = null; String towntrName = null; for(int si = 1; si<3; si++){ if(si==1){//Link and code cityUrl = strs[si].substring(0, strs[si].indexOf("'>")); String cityCode = strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println(cityCode); }else{ towntrName = areaName+strs[si].substring(strs[si].indexOf("'>")+2, strs[si].indexOf("</A>")); System.out.println("Obtain the township (sub district office):" + towntrName); } } readCun(myPrix,cityUrl,towntrName); } } /** * Read village / street data * @param url * @throws Exception */ public static void readCun(String prix,String url,String towntrName) throws Exception{ String content = getContent(baseUrl+prix+url).toUpperCase(); String[] citys = content.split("VILLAGETR"); for(int i=1; i<citys.length; i++){ //Code of village organization String villageCode = null; //Urban and rural classification code String cxfldm = null; //Village name String cunName = null; String[] strs = citys[i].split("<TD>"); villageCode = strs[1].substring(0, strs[1].indexOf("</TD>")); cxfldm = strs[2].substring(0, strs[2].indexOf("</TD>")); cunName = towntrName + strs[3].substring(0, strs[3].indexOf("</TD>")); System.out.println("villageCode: " + villageCode + "***cxfldm:" + cxfldm); System.out.println("Village data:" + cunName); } } //Setting agent public static void initProxy(String host, String port) { System.setProperty("http.proxyType", "4"); System.setProperty("http.proxyPort", port); 
System.setProperty("http.proxyHost", host); System.setProperty("http.proxySet", "true"); } //Get the content of a web page public static String getContent(String strUrl) throws Exception { try { //Prevent the cookie from not being checked, and redirect repeatedly CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); URL url = new URL(strUrl); java.net.HttpURLConnection conn = (HttpURLConnection) url.openConnection(); //Because no verification passed, it did not jump to the next address, or return the current address. //As a result, the same address constantly jumps back to itself and becomes a dead cycle. After 20 times, it's abnormal //java.net.ProtocolException: Server redirected too many times (20) conn.setInstanceFollowRedirects(false); BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(),Charset.forName(CHARSET))); String s = ""; StringBuffer sb = new StringBuffer(""); while ((s = br.readLine()) != null) { sb.append(s); } br.close(); return sb.toString(); } catch (Exception e) { System.out.println("can't open url:"+strUrl); throw e; } }
}
/*
 * Notes:
 * 1. The National Bureau of Statistics restricts access, so it is not practical to fetch the data for the whole country in one run. Fetch it province by province, in batches.
 * 2. The year being fetched can be changed.
 * 3. The results can be written to a database as required, which makes them convenient to use and lets them be refreshed once a year.
 * 4. The program can select the provincial, municipal, county, township (street) and village (residential) committee levels as needed; every code is 12 digits long.
 */