首页 » 网络编程 » Asp.net

ASP.NET 利用 HttpWebRequest 自动获取网页编码并获取网页源代码

Asp.net 2021-12-22
     /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes its body with the given encoding.
    /// </summary>
    /// <param name="url">URL of the page to fetch.</param>
    /// <param name="encoding">
    /// Encoding used to decode the response body. When null, UTF-8 is used.
    /// </param>
    /// <returns>
    /// The decoded body, or <see cref="string.Empty"/> when the request fails, the status is not
    /// 200 OK (redirects are not followed), or the reported content length is 1 MiB or more.
    /// </returns>
    public static string GetHtml(string url, Encoding encoding)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 20000;
            request.AllowAutoRedirect = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                // Only plain 200 responses under the 1 MiB cap are read.
                // NOTE(review): ContentLength is documented as -1 when the header is absent,
                // so such responses pass this check and are read in full.
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                {
                    return string.Empty;
                }

                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                // Disposing the reader also disposes the (possibly wrapped) body stream.
                using (StreamReader reader = new StreamReader(body, encoding ?? Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (WebException ex)
        {
            // A protocol error (4xx/5xx) carries an open response that must be released.
            if (ex.Response != null)
            {
                ex.Response.Close();
            }

            return string.Empty;
        }
        catch (Exception)
        {
            // Deliberate best effort: any other failure (bad URL, I/O error, corrupt gzip data)
            // is reported to the caller as an empty result, as before.
            return string.Empty;
        }
    }
    /// <summary>
    /// Fetches the page at <paramref name="url"/> and determines the name of its character set.
    /// </summary>
    /// <param name="url">URL of the page to inspect.</param>
    /// <returns>
    /// The first <c>charset=...</c> value found in the page source; otherwise the character set
    /// reported by the response; otherwise the body name of <see cref="Encoding.Default"/>.
    /// Returns <see cref="string.Empty"/> when the request fails, the status is not 200 OK
    /// (redirects are not followed), or the reported content length is 1 MiB or more.
    /// </returns>
    public static string GetEncoding(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 20000;
            request.AllowAutoRedirect = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                {
                    return string.Empty;
                }

                // Read the header value up front, before the body stream is consumed and closed.
                string headerCharset = response.CharacterSet;

                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                // ASCII is enough here: only the charset declaration is of interest, and it is
                // made of ASCII characters regardless of the page's real encoding.
                string html;
                using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                {
                    html = reader.ReadToEnd();
                }

                // Accepts both charset=utf-8 (inside a content attribute) and charset="utf-8",
                // and stops at the first character that cannot be part of a charset name.
                Match declared = Regex.Match(
                    html,
                    @"charset\s*=\s*[""']?(?<charset>[-A-Za-z0-9_.:]+)",
                    RegexOptions.IgnoreCase);
                if (declared.Success)
                {
                    return declared.Groups["charset"].Value;
                }

                if (!string.IsNullOrEmpty(headerCharset))
                {
                    return headerCharset;
                }

                return Encoding.Default.BodyName;
            }
        }
        catch (WebException ex)
        {
            // A protocol error (4xx/5xx) carries an open response that must be released.
            if (ex.Response != null)
            {
                ex.Response.Close();
            }

            return string.Empty;
        }
        catch (Exception)
        {
            // Deliberate best effort: any other failure yields an empty result.
            return string.Empty;
        }
    }
 using System;
 using System.Net;
 using System.Text;
 using System.Text.RegularExpressions;

 class Program
 {
     // Matches charset=NAME, with optional quotes around NAME (e.g. <meta charset="utf-8">).
     const string CharsetPattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";

     // Matches the contents of the first <title> element; the tag may carry attributes.
     const string TitlePattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";

     // Gets the HTML of a page, choosing the encoding from the charset declared in the page.
     static string GetHtml(string url)
     {
         return GetHtml(url, null);
     }

     // Gets the HTML of a page using the given encoding.
     // When encoding is null, the bytes are first decoded as UTF-8, the declared charset is
     // looked up in that text, and the bytes are decoded again if a different charset is found.
     static string GetHtml(string url, Encoding encoding)
     {
         byte[] buf;
         using (WebClient client = new WebClient())
         {
             buf = client.DownloadData(url);
         }

         if (encoding != null)
         {
             return encoding.GetString(buf);
         }

         string html = Encoding.UTF8.GetString(buf);
         Encoding detected = GetEncoding(html);

         // Compare by code page: a UTF-8 instance returned by Encoding.GetEncoding is not
         // guaranteed to be the same object as Encoding.UTF8.
         if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
         {
             return html;
         }

         return detected.GetString(buf);
     }

     // Extracts the encoding declared in the page's HTML.
     // Returns null when no charset is declared or its name is not a known encoding.
     static Encoding GetEncoding(string html)
     {
         Match match = Regex.Match(html, CharsetPattern);
         if (!match.Success)
         {
             return null;
         }

         try
         {
             return Encoding.GetEncoding(match.Groups["charset"].Value);
         }
         catch (ArgumentException)
         {
             return null;
         }
     }

     // Extracts the trimmed text of the page's <title> element, or an empty string if none.
     static string GetTitle(string html)
     {
         return Regex.Match(html, TitlePattern).Groups["title"].Value.Trim();
     }

     // Prints the detected encoding and the title of the page at the given URL.
     static void PrintEncodingAndTitle(string url)
     {
         string html = GetHtml(url);
         Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
     }

     // Program entry point.
     static void Main()
     {
         PrintEncodingAndTitle("http://www.msdn.net/");
         PrintEncodingAndTitle("http://www.cnblogs.com/");
         PrintEncodingAndTitle("http://www.cnblogs.com/skyiv/");
         PrintEncodingAndTitle("http://www.csdn.net/");
         PrintEncodingAndTitle("http://news.163.com/");
     }
 }
 /* Sample output recorded in the original article (live pages may now give different results):
 [] [MSDN: Microsoft Developer Network] 
 [System.Text.UTF8Encoding] [www.580doc.com - 程序员的网上家园] 
 [System.Text.UTF8Encoding] [空间/IV - www.580doc.com] 
 [System.Text.UTF8Encoding] [CSDN.NET - 中国最大的IT技术社区,为IT专业技术人员提供最全面的信息传播和服务平台] 
 [System.Text.DBCSCodePageEncoding] [新闻中心_网易新闻] */

 


上一篇:Asp.NET 根据URL获取整个页面HTML | 下一篇:.NET6中使用CuteEditor详解
程序园_程序员的世界 Copyright © 2020- www.580doc.com. Some Rights Reserved.