Professional Applications Programmers/Consultants برمجة واستشارات تطبيقات الإنترنت
Skip Navigation Links | Home » Code Library » Html

General-purpose public code classes and XML files that we have compiled and used over the years:

Handle HTML encoding, decoding functions.

   1:  using System;
   2:  using System.Web;
   3:  using System.Xml;
   4:  using System.Text;
   5:  using System.Text.RegularExpressions;
   6:  using System.Globalization;
   7:   
   8:  namespace Ia.Cl.Model
   9:  {
  10:      ////////////////////////////////////////////////////////////////////////////
  11:   
  12:      /// <summary publish="true">
  13:      /// Handle HTML encoding, decoding functions.
  14:      /// </summary>
  15:      /// <remarks> 
  16:      /// Copyright © 2001-2018 Jasem Y. Al-Shamlan (info@ia.com.kw), Internet Applications - Kuwait. All Rights Reserved.
  17:      ///
  18:      /// This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
  19:      /// the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  20:      ///
  21:      /// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  22:      /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  23:      /// 
  24:      /// You should have received a copy of the GNU General Public License along with this library. If not, see http://www.gnu.org/licenses.
  25:      /// 
  26:      /// Copyright notice: This notice may not be removed or altered from any source distribution.
  27:      /// </remarks> 
  28:      public class Html
  29:      {
  30:          private static readonly Regex regexBetweenTags = new Regex(@">\s+<", RegexOptions.Compiled);
  31:          private static readonly Regex regexLineBreaks = new Regex(@"\n\s+", RegexOptions.Compiled);
  32:   
  33:          ////////////////////////////////////////////////////////////////////////////
  34:   
  35:          /// <summary>
  36:          ///
  37:          /// </summary>
  38:          public Html() { }
  39:   
  40:          ////////////////////////////////////////////////////////////////////////////
  41:   
  42:          ///<summary>
  43:          ///
  44:          /// </summary>
  45:          /// <param name="s"></param>
  46:          /// <returns></returns>
  47:          /// <remark>http://www.west-wind.com/weblog/posts/2009/Feb/05/Html-and-Uri-String-Encoding-without-SystemWeb</remark>
  48:          public static string HtmlEncode(string s)
  49:          {
  50:  #if WINDOWS_FORM
  51:   
  52:              if (s == null) return null;
  53:   
  54:              StringBuilder sb = new StringBuilder(s.Length);
  55:   
  56:              int len = s.Length;
  57:   
  58:              for (int i = 0; i < len; i++)
  59:              {
  60:                  switch (s[i])
  61:                  {
  62:                      case '<': sb.Append("&lt;"); break;
  63:                      case '>': sb.Append("&gt;"); break;
  64:                      case '"': sb.Append("&quot;"); break;
  65:                      case '&': sb.Append("&amp;"); break;
  66:                      default:
  67:                          if (s[i] > 159)
  68:                          {
  69:                              // decimal numeric entity
  70:                              sb.Append("&#");
  71:                              sb.Append(((int)s[i]).ToString(CultureInfo.InvariantCulture));
  72:                              sb.Append(";");
  73:                          }
  74:                          else sb.Append(s[i]);
  75:                          break;
  76:                  }
  77:              }
  78:   
  79:              return sb.ToString();
  80:  #else
  81:              return HttpUtility.HtmlEncode(s);
  82:  #endif
  83:          }
  84:   
  85:          ////////////////////////////////////////////////////////////////////////////
  86:   
  87:          ///<summary>
  88:          ///
  89:          /// </summary>
  90:          public static string HtmlDecode(string s)
  91:          {
  92:  #if WINDOWS_FORM
  93:              s = s.Replace("&lt;","<");
  94:              s = s.Replace("&gt;",">");
  95:              s = s.Replace("&quot;",@"""");
  96:              s = s.Replace("&amp;","&");
  97:   
  98:              return s;
  99:  #else
 100:              return HttpUtility.HtmlDecode(s);
 101:  #endif
 102:          }
 103:   
 104:          ////////////////////////////////////////////////////////////////////////////
 105:   
 106:          /// <summary>
 107:          ///
 108:          /// </summary>
 109:          public static string Encode(string s)
 110:          {
 111:              s = HtmlEncode(s);
 112:   
 113:              // database requirement:
 114:              s = s.Replace(@"'", @"_#039_");
 115:              s = s.Replace(@"?", @"_#063_");
 116:   
 117:              return s;
 118:          }
 119:   
 120:          ////////////////////////////////////////////////////////////////////////////
 121:   
 122:          /// <summary>
 123:          ///
 124:          /// </summary>
 125:          public static string Decode(string s)
 126:          {
 127:              // database requirement:
 128:              s = s.Replace(@"_#063_", @"?");
 129:              s = s.Replace(@"_#039_", @"'");
 130:   
 131:              s = HtmlDecode(s);
 132:   
 133:              return s;
 134:          }
 135:   
 136:          ////////////////////////////////////////////////////////////////////////////
 137:   
 138:          /// <summary>
 139:          ///
 140:          /// </summary>
 141:          public static string DecodeRemoveNLLF(string s)
 142:          {
 143:              // database requirement:
 144:   
 145:              s = s.Replace(@"_#063_", @"?");
 146:              s = s.Replace(@"_#039_", @"'");
 147:   
 148:              s = HtmlDecode(s);
 149:   
 150:              s = s.Replace("\n\r", " ");
 151:              s = s.Replace("\r\n", " ");
 152:              s = s.Replace("\n", " ");
 153:              s = s.Replace("\r", " ");
 154:   
 155:              return s;
 156:          }
 157:   
 158:          ////////////////////////////////////////////////////////////////////////////
 159:   
 160:          /// <summary>
 161:          ///
 162:          /// </summary>
 163:          public static string XmlEncode(string s)
 164:          {
 165:              s = HtmlEncode(s);
 166:   
 167:              s = s.Replace(@"'", @"_#039_");
 168:              s = s.Replace(@"\", @"_#092_");
 169:              s = s.Replace(@"?", @"_#063_");
 170:   
 171:              /*
 172:              &amp;  =  &
 173:              &lt;   =  <
 174:              &gt;   =  >
 175:              &quot; =  "
 176:              &apos; =  '
 177:              */
 178:   
 179:              // XML requirement:
 180:              s = s.Replace("&", "_amp_");
 181:              s = s.Replace(">", "_gt_");
 182:              s = s.Replace("<", "_lt_");
 183:   
 184:              return s;
 185:          }
 186:   
 187:          ////////////////////////////////////////////////////////////////////////////
 188:   
 189:          /// <summary>
 190:          ///
 191:          /// </summary>
 192:          public static string XmlDecode(string s)
 193:          {
 194:              // XML requirement
 195:              s = s.Replace("_gt_", ">");
 196:              s = s.Replace("_lt_", "<");
 197:              s = s.Replace("_amp_", "&");
 198:   
 199:              s = s.Replace(@"_#039_", @"'");
 200:              s = s.Replace(@"_#092_", @"\");
 201:              s = s.Replace(@"_#063_", @"?");
 202:   
 203:              s = HtmlDecode(s);
 204:              return s;
 205:          }
 206:   
 207:          ////////////////////////////////////////////////////////////////////////////
 208:   
 209:          /// <summary>
 210:          ///
 211:          /// </summary>
 212:          public static string Code(string code)
 213:          {
 214:              // this displays an HTML code in regular text
 215:              /*
 216:              s=s.Replace("_gt_",">");
 217:              s=s.Replace("_lt_","<");
 218:              s=s.Replace("_amp_","&");
 219:   
 220:              s=s.Replace(@"_#039_",@"'");
 221:              s=s.Replace(@"_#092_",@"\");
 222:              s=s.Replace(@"_#063_",@"?");
 223:              */
 224:   
 225:              code = HtmlEncode(code);
 226:              return code;
 227:          }
 228:   
 229:          ////////////////////////////////////////////////////////////////////////////
 230:   
 231:          /// <summary>
 232:          ///
 233:          /// </summary>
 234:          public static string StripHtml(string source)
 235:          {
 236:              try
 237:              {
 238:                  string result;
 239:   
 240:                  // Remove HTML Development formatting
 241:                  // Replace line breaks with space
 242:                  // because browsers inserts space
 243:                  result = source.Replace("\r", " ");
 244:   
 245:                  // Replace line breaks with space
 246:                  // because browsers inserts space
 247:                  result = result.Replace("\n", " ");
 248:   
 249:                  // Remove step-formatting
 250:                  result = result.Replace("\t", string.Empty);
 251:   
 252:                  // Remove repeating speces becuase browsers ignore them
 253:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " ");
 254:   
 255:                  // Remove the header (prepare first by clearing attributes)
 256:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 257:   
 258:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 259:   
 260:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 261:   
 262:                  // remove all scripts (prepare first by clearing attributes)
 263:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 264:   
 265:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 266:   
 267:                  //result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>)([^(<script>\.</script>)])*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 268:   
 269:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 270:   
 271:                  // remove all styles (prepare first by clearing attributes)
 272:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 273:   
 274:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 275:   
 276:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 277:   
 278:                  // insert tabs in spaces of <td> tags
 279:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 280:   
 281:                  // insert line breaks in places of <BR> and <LI> tags
 282:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 283:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 284:   
 285:                  // insert line paragraphs (double line breaks) in place
 286:                  // if <P>, <DIV> and <TR> tags
 287:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 288:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 289:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 290:   
 291:                  // Remove remaining tags like <a>, links, images, // comments etc - anything thats enclosed inside < >
 292:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 293:   
 294:                  // replace special characters:
 295:                  result = System.Text.RegularExpressions.Regex.Replace(result, @" ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 296:   
 297:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 298:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 299:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 300:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 301:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 302:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 303:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&gt;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 304:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 305:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 306:   
 307:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&nbsp;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 308:   
 309:                  // Remove all others. More can be added, see
 310:                  // http://hotwired.lycos.com/webmonkey/reference/special_characters/
 311:                  result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 312:   
 313:                  // for testng
 314:                  //System.Text.RegularExpressions.Regex.Replace(result, 
 315:                  //      this.txtRegex.Text,string.Empty, 
 316:                  //      System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 317:   
 318:                  // make line breaking consistent
 319:                  result = result.Replace("\n", "\r");
 320:   
 321:                  // Remove extra line breaks and tabs:
 322:                  // replace over 2 breaks with 2 and over 4 tabs with 4. 
 323:                  // Prepare first to remove any whitespaces inbetween
 324:                  // the escaped characters and remove redundant tabs inbetween linebreaks
 325:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 326:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 327:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 328:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 329:   
 330:                  // Remove redundant tabs
 331:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 332:                  
 333:                  // Remove multible tabs followind a linebreak with just one tab
 334:                  result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
 335:                  
 336:                  // Initial replacement target string for linebreaks
 337:                  string breaks = "\r\r\r";
 338:                  // Initial replacement target string for tabs
 339:                  string tabs = "\t\t\t\t\t";
 340:   
 341:                  for (int index = 0; index < result.Length; index++)
 342:                  {
 343:                      result = result.Replace(breaks, "\r\r");
 344:                      result = result.Replace(tabs, "\t\t\t\t");
 345:                      breaks = breaks + "\r";
 346:                      tabs = tabs + "\t";
 347:                  }
 348:   
 349:                  // Thats it.
 350:                  return result;
 351:   
 352:              }
 353:              catch
 354:              {
 355:                  //MessageBox.Show("Error");
 356:                  return null;
 357:              }
 358:          }
 359:   
 360:          ////////////////////////////////////////////////////////////////////////////
 361:   
 362:          /// <summary>
 363:          ///
 364:          /// </summary>
 365:          public static string TextToHtml(string source)
 366:          {
 367:              // clean regular text format pages and return an equivalent html format
 368:   
 369:              string s;
 370:   
 371:              s = Decode(source);
 372:              //s = global::Ia.Cl.Model.Html.Html_Strip(s);
 373:              s = Regex.Replace(s, @"\.", @". ");
 374:              s = Regex.Replace(s, @"[ ]+", @" ");
 375:              s = s.Replace("\r", "");
 376:              s = s.Replace("\n+", "\n");
 377:              //s = "<p>" + s.Replace("\n", "</p>\n<p>") + "</p>";
 378:              /*
 379:              s = s.Replace("\n", "</p>\n<p>");
 380:   
 381:              // clean up
 382:              u = sb.ToString();
 383:              u = Regex.Replace(u, @"^\s+", "");
 384:              u = Regex.Replace(u, @">\s+", ">");
 385:              u = Regex.Replace(u, @"\s+<", "<");
 386:              u = Regex.Replace(u, @"\s+", " ");
 387:              u = Regex.Replace(u, @"\n+", @"<br/>"); // keep newlines
 388:              //u = Regex.Replace(u, @"</ul>(.+?)</ul>", "</ul><p>$1</p></ul>");
 389:              //u = Regex.Replace(u, @"</ul>(.+?)</p>", "</ul><p>$1</p></p>");
 390:              //u = u.Replace(@"�", "<p/>&nbsp;&nbsp;&nbsp;�&nbsp;");
 391:              */
 392:   
 393:              return s;
 394:          }
 395:   
 396:          ////////////////////////////////////////////////////////////////////////////
 397:   
 398:          /// <summary>
 399:          ///
 400:          /// </summary>
 401:          public static string TextToHtml2(string source)
 402:          {
 403:              // clean regular text format pages and return an equivalent html format
 404:   
 405:              string s;
 406:   
 407:              s = Decode(source);
 408:              //s = global::Ia.Cl.Model.Html.Html_Strip(s);
 409:              s = Regex.Replace(s, @"\.", @". ");
 410:              s = Regex.Replace(s, @"[ ]+", @" ");
 411:              s = s.Replace("\r", "");
 412:              s = s.Replace("\n+", "\n");
 413:              s = "<p>" + s.Replace("\n", "</p>\n<p>") + "</p>";
 414:   
 415:              /*
 416:              s = s.Replace("\n", "</p>\n<p>");
 417:   
 418:              // clean up
 419:              u = sb.ToString();
 420:              u = Regex.Replace(u, @"^\s+", "");
 421:              u = Regex.Replace(u, @">\s+", ">");
 422:              u = Regex.Replace(u, @"\s+<", "<");
 423:              u = Regex.Replace(u, @"\s+", " ");
 424:              u = Regex.Replace(u, @"\n+", @"<br/>"); // keep newlines
 425:              //u = Regex.Replace(u, @"</ul>(.+?)</ul>", "</ul><p>$1</p></ul>");
 426:              //u = Regex.Replace(u, @"</ul>(.+?)</p>", "</ul><p>$1</p></p>");
 427:              //u = u.Replace(@"�", "<p/>&nbsp;&nbsp;&nbsp;�&nbsp;");
 428:              */
 429:   
 430:              return s;
 431:          }
 432:   
 433:          ////////////////////////////////////////////////////////////////////////////
 434:   
 435:          /// <summary>
 436:          ///
 437:          /// </summary>
 438:          public static string TextToHtmlAndOl_Ul_LiToBr(string source)
 439:          {
 440:              // clean regular text format pages and return an equivalent html format
 441:   
 442:              string s;
 443:   
 444:              s = Decode(source);
 445:              s = Regex.Replace(s, @"\.", @". ");
 446:              s = Regex.Replace(s, @"[ ]+", @" ");
 447:              s = s.Replace("\r", "");
 448:              s = s.Replace("\n+", "\n");
 449:   
 450:              s = s.Replace("<ol>", "<br/> <br/>");
 451:              s = s.Replace("</ol>", "");
 452:              s = s.Replace("<ul>", "<br/> <br/>");
 453:              s = s.Replace("</ul>", "");
 454:              s = s.Replace("<li>", "-");
 455:              s = s.Replace("</li>", "<br/>");
 456:   
 457:              return s;
 458:          }
 459:   
 460:          ////////////////////////////////////////////////////////////////////////////
 461:   
 462:          /// <summary>
 463:          ///
 464:          /// <see cref="http://madskristensen.net/post/remove-whitespace-from-your-pages"/>
 465:          /// </summary>
 466:          public static string RemoveWhitespaceFromHtml(string html)
 467:          {
 468:              // for now we will skip if page has <pre>
 469:   
 470:              if (!html.Contains("<pre>"))
 471:              {
 472:                  html = regexBetweenTags.Replace(html, "> <");
 473:                  html = regexLineBreaks.Replace(html, string.Empty);
 474:              }
 475:   
 476:              return html.Trim();
 477:          }
 478:   
 479:          ////////////////////////////////////////////////////////////////////////////
 480:          ////////////////////////////////////////////////////////////////////////////
 481:      }
 482:  }