HttpClient 在 Java 中的使用 http://www.bieryun.com/1011.html
项目中一直在使用 HttpClient,版本是 3.6 的,负责维护的同事离职后就没有再更新过。在这次项目改版中,决定把这一块升级到 4.3 版本。查阅了一些资料后先写了一部分出来,还不是很完善,后期有时间再更新。
![]()
这里对 HttpClient 如何获取原页面的编码做了一些处理,感觉还不是很完善,后期想到更好的办法再处理。
具体的理论东西就不说了,代码中注释都有一些说明。
直接上代码
- package com.boryou.module.load;
-
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InterruptedIOException;
- import java.net.MalformedURLException;
- import java.net.URI;
- import java.net.URISyntaxException;
- import java.net.URL;
- import java.net.UnknownHostException;
- import java.nio.charset.Charset;
- import java.security.KeyManagementException;
- import java.security.KeyStoreException;
- import java.security.NoSuchAlgorithmException;
- import java.security.cert.CertificateException;
- import java.security.cert.X509Certificate;
- import java.util.Random;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import java.util.zip.GZIPInputStream;
-
- import javax.net.ssl.SSLContext;
- import javax.net.ssl.SSLException;
-
- import org.apache.http.Header;
- import org.apache.http.HeaderElement;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpEntityEnclosingRequest;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpRequest;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.HttpRequestRetryHandler;
- import org.apache.http.client.config.CookieSpecs;
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.protocol.HttpClientContext;
- import org.apache.http.conn.ConnectTimeoutException;
- import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
- import org.apache.http.entity.ContentType;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.client.LaxRedirectStrategy;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.protocol.HttpContext;
- import org.apache.http.ssl.SSLContextBuilder;
- import org.apache.http.ssl.TrustStrategy;
- import org.apache.http.util.ByteArrayBuffer;
-
- import com.boryou.constant.Constant;
- import com.boryou.util.Common;
-
-
- public class InfoLoad {
-
- private PoolingHttpClientConnectionManager httpClientConnectionManager = null;
-
-
-
- private static final InfoLoad infoLoad = new InfoLoad();
-
- public static InfoLoad getInfoLoadInstance(){
- return infoLoad;
- }
-
-
- private InfoLoad(){
-
- initHttpClient();
- }
-
- public void initHttpClient(){
-
- httpClientConnectionManager = new PoolingHttpClientConnectionManager();
-
- httpClientConnectionManager.setMaxTotal(Constant.HTTPCLIENT_CONNECTION_COUNT);
-
- httpClientConnectionManager.setDefaultMaxPerRoute(Constant.HTTPCLIENT_MAXPERROUTE_COUNT);
- }
-
- HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {
- public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
- if (executionCount >= 3) {
-
- return false;
- }
- if (exception instanceof InterruptedIOException) {
-
- return false;
- }
- if (exception instanceof UnknownHostException) {
-
- return false;
- }
- if (exception instanceof ConnectTimeoutException) {
-
- return false;
- }
- if (exception instanceof SSLException) {
-
- return false;
- }
- HttpClientContext clientContext = HttpClientContext.adapt(context);
- HttpRequest request = clientContext.getRequest();
- boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
- if (idempotent) {
-
- return true;
- }
- return false;
- }
- };
-
- public CloseableHttpClient getHttpClient(){
-
- RequestConfig requestConfig = RequestConfig.custom()
- .setConnectTimeout(Constant.HTTPCLIENT_CONNECT_TIMEOUT)
- .setSocketTimeout(Constant.HTTPCLIENT_SOCKET_TIMEOUT)
- .setCookieSpec(CookieSpecs.BEST_MATCH).build();
-
- LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();
-
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(httpClientConnectionManager)
- .setDefaultRequestConfig(requestConfig)
- .setRedirectStrategy(redirectStrategy)
- .setRetryHandler(myRetryHandler)
- .build();
- return httpClient;
- }
-
-
- public static String loadForString(String urlString, int type){
- String src = "";
- if(null==urlString || urlString.isEmpty() || !urlString.startsWith("http")){
- return src;
- }
-
- CloseableHttpResponse response = null;
- HttpGet httpGet = null;
- urlString = urlString.trim();
-
- try {
- URL url = new URL(urlString);
- URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
- httpGet = new HttpGet(uri);
-
-
- httpGet.addHeader("Accept","*/*");
- httpGet.addHeader("Connection","keep-alive");
- httpGet.addHeader("Accept-Encoding", "gzip, deflate");
-
-
- Random random = new Random();
- int randomInt = random.nextInt(4);
- System.err.println(randomInt);
-
- httpGet.addHeader("User-Agent", Constant.USER_AGENT[randomInt]);
-
-
- try {
- if(urlString.startsWith("https")){
- System.setProperty ("jsse.enableSNIExtension", "false");
- response = createSSLClientDefault().execute(httpGet);
- }else{
- response = infoLoad.getHttpClient().execute(httpGet);
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
-
-
- int statuCode = response.getStatusLine().getStatusCode();
-
- switch (statuCode){
- case 200:
-
- HttpEntity entity = response.getEntity();
- /**
- * 仿浏览器获取网页编码
- * 浏览器是先从content-type的charset(响应头信息)中获取编码,
- * 如果获取不了,则会从meta(HTML里的代码)中获取charset的编码值
- */
-
- String charset = null;
- ContentType contentType = null;
- contentType = ContentType.getOrDefault(entity);
- Charset charsets = contentType.getCharset();
- if(null != charsets){
- charset = charsets.toString();
- }
-
- Header header = entity.getContentEncoding();
- boolean isGzip = false;
- if(null != header){
- for(HeaderElement headerElement : header.getElements()){
- if(headerElement.getName().equalsIgnoreCase("gzip")){
- isGzip = true;
- }
- }
- }
-
- InputStream inputStream = entity.getContent();
- ByteArrayBuffer buffer = new ByteArrayBuffer(4096);
- byte[] tmp = new byte[4096];
- int count;
- if(isGzip){
- GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
- while((count=gzipInputStream.read(tmp)) != -1){
- buffer.append(tmp, 0, count);
- }
- }else{
- while((count=inputStream.read(tmp)) != -1){
- buffer.append(tmp, 0, count);
- }
- }
-
- if(null==charset || "".equals(charset) || "null".equals(charset)
- || "zh-cn".equalsIgnoreCase(charset)){
- charset = getCharsetFromMetaTag(buffer, urlString);
- }
-
- src = new String(buffer.toByteArray(), charset);
-
- src = replaceStr(src);
-
- src = Common.decodeUnicode(src);
- break;
- case 400:
- System.out.println("下载400错误代码,请求出现语法错误" + urlString);
-
-
- break;
- case 403:
- System.out.println("下载403错误代码,资源不可用" + urlString);
-
- break;
- case 404:
- System.out.println("下载404错误代码,无法找到指定资源地址" + urlString);
-
- break;
- case 503:
- System.out.println("下载503错误代码,服务不可用" + urlString);
-
- break;
- case 504:
- System.out.println("下载504错误代码,网关超时" + urlString);
-
- break;
- }
-
- } catch (MalformedURLException e) {
-
- e.printStackTrace();
- } catch (URISyntaxException e) {
-
- e.printStackTrace();
- } catch (ClientProtocolException e) {
-
- e.printStackTrace();
- } catch (IOException e) {
-
- e.printStackTrace();
- } finally{
- if(response != null){
- try {
- response.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- httpGet.abort();
-
- }
-
- return src;
- }
-
- public static CloseableHttpClient createSSLClientDefault(){
- try {
- SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy(){
-
- public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
- return true;
- }}).build();
-
- SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext);
-
- return HttpClients.custom().setSSLSocketFactory(sslsf).build();
-
- } catch (KeyManagementException e) {
- e.printStackTrace();
- } catch (NoSuchAlgorithmException e) {
- e.printStackTrace();
- } catch (KeyStoreException e) {
- e.printStackTrace();
- }
-
- return HttpClients.createDefault();
- }
-
- public static String getCharsetFromMetaTag(ByteArrayBuffer buffer,String url){
- String charset = null;
- String regEx = Constant.CHARSET_REGEX;
- Pattern p = Pattern.compile(regEx,
- Pattern.CASE_INSENSITIVE);
- Matcher m = p.matcher(new String(buffer.toByteArray()));
- boolean result = m.find();
- if (result) {
- if (m.groupCount() == 1) {
- charset = m.group(1);
- }
- System.err.println("网页 中的编码:" + charset + "\t url:" + url);
- } else {
-
- charset = "gbk";
- System.out.println("字符编码未匹配到 : " + url);
- }
- return charset;
- }
-
- public static String replaceStr(String src){
- if (src == null || "".equals(src)) return null;
- src = src.replaceAll("<!--", "");
- src = src.replaceAll("-->", "");
- src = src.replaceAll("<", "<");
- src = src.replaceAll(">", ">");
- src = src.replaceAll(""", "\"");
- src = src.replaceAll(" ", " ");
- src = src.replaceAll("&", "&");
- return src;
- }
-
-
- public static void main(String[] args) {
-
- Random random = new Random();
- int randomInt = random.nextInt(4);
- System.out.println(randomInt);
- InfoLoad.getInfoLoadInstance().loadForString("http://www.xinli001.com/info", 0);
- }
-
- }
这里用到了 IO 常用的工具类 IOUtils,详细了解可以参考这篇文章:[IO常用工具类IOUtils(Java读文件、写文件、打Zip包)](http://www.bieryun.com/1003.html)