Thursday, December 5, 2013

How to get the substring of a string that contains HTML tags using C# .Net


/// <summary>
/// Returns the leading part of an HTML fragment that holds at most
/// <paramref name="maxlength"/> text characters. Tags that occur before the cut are
/// copied verbatim and do not count towards the limit; when the fragment is cut short,
/// every element still open at the cut is closed so the result stays balanced.
/// </summary>
/// <remarks>
/// Only markup matched by the tag pattern is treated as a tag. Anything else (comments,
/// character references such as <c>&amp;nbsp;</c>, tag names containing '-') is counted
/// one character at a time, so it can be cut in the middle.
/// </remarks>
/// <param name="html">HTML fragment to shorten.</param>
/// <param name="maxlength">Maximum number of non-tag characters to keep.</param>
/// <returns>
/// The shortened fragment, or an empty string when <paramref name="maxlength"/> is zero or negative.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="html"/> is null.</exception>
public string HtmlSubstring(string html, int maxlength)
{
    if (html == null)
    {
        throw new System.ArgumentNullException("html");
    }

    if (maxlength <= 0)
    {
        return string.Empty;
    }

    var content = new StringBuilder();
    var openElements = new System.Collections.Generic.Stack<string>();
    int textLength = 0;
    bool truncated = false;

    foreach (Match token in HtmlTokenPattern.Matches(html))
    {
        if (token.Length == 0)
        {
            // The single-character alternative is optional, so the scan ends with one empty match.
            continue;
        }

        if (textLength >= maxlength)
        {
            // Something (text or markup) is left over: stop scanning and balance the output below.
            truncated = true;
            break;
        }

        content.Append(token.Value);

        if (token.Groups[1].Success)
        {
            // Group 1 is the tag alternative; tags are copied but never counted.
            TrackOpenElements(token.Value, openElements);
        }
        else
        {
            textLength++;
        }
    }

    if (truncated)
    {
        // Close innermost elements first so the emitted end tags nest correctly.
        while (openElements.Count > 0)
        {
            content.Append("</").Append(openElements.Pop()).Append('>');
        }
    }

    return content.ToString();
}

// Pattern for a single HTML start or end tag, with optional attributes.
private const string HtmlTagPattern = "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>";

// Group 1: a whole tag. Group 2: any single character. [\s\S] is used instead of '.'
// because '.' does not match '\n' by default, which made line breaks vanish from the output.
private static readonly Regex HtmlTokenPattern = new Regex("(" + HtmlTagPattern + ")|([\\s\\S]?)");

// Extracts the element name from a tag that HtmlTagPattern has already matched.
private static readonly Regex TagNamePattern = new Regex("^</?(\\w+)");

// Elements that never have an end tag and therefore must not be closed.
private static readonly System.Collections.Generic.HashSet<string> VoidElements =
    new System.Collections.Generic.HashSet<string>(System.StringComparer.OrdinalIgnoreCase)
    {
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr"
    };

// Updates the stack of currently open elements for one matched tag.
private static void TrackOpenElements(string tag, System.Collections.Generic.Stack<string> openElements)
{
    string name = TagNamePattern.Match(tag).Groups[1].Value;

    if (tag.StartsWith("</", System.StringComparison.Ordinal))
    {
        bool isOpen = false;
        foreach (string open in openElements)
        {
            if (string.Equals(open, name, System.StringComparison.OrdinalIgnoreCase))
            {
                isOpen = true;
                break;
            }
        }

        // A stray end tag with no matching start tag is ignored; otherwise unwind to the
        // matching element, dropping anything the source left unclosed inside it.
        if (isOpen)
        {
            string closed;
            do
            {
                closed = openElements.Pop();
            }
            while (!string.Equals(closed, name, System.StringComparison.OrdinalIgnoreCase));
        }

        return;
    }

    if (tag.EndsWith("/>", System.StringComparison.Ordinal) || VoidElements.Contains(name))
    {
        // Self-closing and void elements never stay open.
        return;
    }

    openElements.Push(name);
}

No comments: