Unwanted HTML elements in your Sitecore richtext fields
Custom save events to the rescue!
I had been planning this blog post for years but kept postponing it; after similar requests from our great Sitecore community, it is finally here.
The problem I have solved in my projects in the past is the following:
When editing rich text fields, the default behavior of the Sitecore Telerik control is to insert too many HTML elements around your content: for example, <p> tags you don't want, or <p> tags or divs around images that you embed in the rich text field.
At some point I was so annoyed by this and the bad HTML it produced that I decided to customize the item save event to clean up the resulting HTML. Below you will find my code; I hope you find it useful. Be aware that this code has only been tested on Sitecore 7.x. While I expect it to work on 8+ as well, you should test for yourself that nothing breaks.
Also, extend it to your own needs to create your own neat and clean HTML.
///
/// Don't forget following config file patch for Sitecore:
/// <configuration>
/// <sitecore>
/// <events>
/// <event name="item:saved">
/// <handler type="YourNameSpace.Sitecore.Events.RichTextEditorSaveEvent, YourNameSpace.Sitecore" method="OnItemSaving">
/// <database>master</database>
/// </handler>
/// </event>
/// </events>
/// </sitecore>
/// </configuration>
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using HtmlAgilityPack;
using Sitecore.Collections;
using Sitecore.Data;
using Sitecore.Data.Fields;
using Sitecore.Data.Items;
using Sitecore.Diagnostics;
using Sitecore.Events;
using Sitecore.SecurityModel;
namespace YourNameSpace.Sitecore.Events
{
/// <summary>
/// Item save handler that cleans up the HTML stored in rich text fields.
/// The handler is invoked for every item, but only fields whose type key is "rich text"
/// are processed: empty paragraphs are removed, paragraphs wrapped around images are
/// unwrapped, and a span that is the only child of a paragraph is merged into that paragraph.
/// </summary>
public class RichTextEditorSaveEvent
{
    /// <summary>
    /// IDs of the items that are currently being processed. The handler saves the item itself,
    /// so this list is used to ignore the nested invocation for the same item.
    /// </summary>
    private static readonly SynchronizedCollection<ID> MProcess = new SynchronizedCollection<ID>();

    /// <summary>
    /// Name of the database this handler is restricted to; items from other databases are ignored.
    /// Presumably populated from the handler's database element in the Sitecore configuration.
    /// </summary>
    public string Database
    {
        get;
        set;
    }

    /// <summary>
    /// Event entry point. Extracts the item from the event arguments and rewrites the value of
    /// each non-empty rich text field with its cleaned-up HTML.
    /// </summary>
    /// <param name="sender">Event sender (unused).</param>
    /// <param name="args">Event arguments; the item is expected as parameter 0.</param>
    public void OnItemSaving(object sender, EventArgs args)
    {
        var item = Event.ExtractParameter(args, 0) as Item;
        if (item == null)
        {
            return;
        }

        bool isOtherDatabase = item.Database != null && String.Compare(item.Database.Name, this.Database) != 0;
        if (isOtherDatabase || MProcess.Contains(item.ID))
        {
            return;
        }

        MProcess.Add(item.ID);
        try
        {
            foreach (Field field in item.Fields)
            {
                if (!field.TypeKey.Equals("rich text", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                string original = field.Value;
                if (string.IsNullOrEmpty(original))
                {
                    continue;
                }

                string cleaned = CleanHtml(original.Trim());

                // Nothing to write back when the clean-up did not change the stored value.
                if (string.Equals(cleaned, original, StringComparison.Ordinal))
                {
                    continue;
                }

                using (new SecurityDisabler())
                {
                    item.Editing.BeginEdit();
                    field.Value = cleaned;
                    item.Editing.EndEdit();
                }
            }
        }
        catch (Exception ex)
        {
            // Cleaning is best effort and must never break the save, but the failure is logged
            // instead of being silently discarded.
            Log.Error("RichTextEditorSaveEvent: failed to clean rich text fields of item " + item.ID, ex, this);
        }
        finally
        {
            MProcess.Remove(item.ID);
        }
    }

    /// <summary>
    /// Runs all clean-up steps on a single field value.
    /// </summary>
    /// <param name="content">Trimmed, non-empty field value.</param>
    /// <returns>
    /// The cleaned HTML. If processing throws, the input is returned, wrapped in a paragraph
    /// when it does not already contain an opening paragraph tag.
    /// </returns>
    private string CleanHtml(string content)
    {
        try
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(content);
            RemoveEmptyPTags(htmlDocument);
            RemovePTagAroundImages(htmlDocument);
            MoveSpanClassAndStyleToPTagIfIsDirectAndOnlyChild(htmlDocument);
            return htmlDocument.DocumentNode.InnerHtml;
        }
        catch (Exception)
        {
            //Apparently no html or not valid, in this case wrap in <p>
            if (!content.Contains("<p>"))
            {
                return "<p>" + content + "</p>";
            }

            return content;
        }
    }

    /// <summary>
    /// Merges a span into its parent paragraph when the span is the paragraph's only child:
    /// the span's attributes are copied to the paragraph (appended when the paragraph already
    /// has an attribute of that name) and the span is replaced by its own children.
    /// </summary>
    /// <param name="content">Document to modify in place.</param>
    private void MoveSpanClassAndStyleToPTagIfIsDirectAndOnlyChild(HtmlDocument content)
    {
        HtmlNodeCollection spanNodes = content.DocumentNode.SelectNodes("//span");
        if (spanNodes == null)
        {
            return;
        }

        foreach (HtmlNode spanTag in spanNodes)
        {
            HtmlNode parent = spanTag.ParentNode;
            if (parent == null || parent.Name != "p" || parent.ChildNodes.Count != 1)
            {
                continue;
            }

            foreach (HtmlAttribute attr in spanTag.Attributes)
            {
                HtmlAttribute existing = parent.Attributes[attr.Name];
                if (existing == null)
                {
                    parent.Attributes.Add(attr.Name, attr.Value);
                }
                else if (string.IsNullOrEmpty(existing.Value))
                {
                    existing.Value = attr.Value;
                }
                else
                {
                    existing.Value += GetAttributeSeparator(attr.Name, existing.Value) + attr.Value;
                }
            }

            // Nodes are moved rather than re-parsed, so the other spans collected above stay
            // attached to the document and a nested only-child span is merged in the same pass.
            ReplaceWithChildren(spanTag);
        }
    }

    /// <summary>
    /// Removes paragraph tags that directly wrap an image. The paragraph's children, including
    /// the image, take the paragraph's position in its parent. Any paragraph that directly
    /// contains an image is unwrapped, also when it holds other content.
    /// </summary>
    /// <param name="content">Document to modify in place.</param>
    private void RemovePTagAroundImages(HtmlDocument content)
    {
        HtmlNodeCollection imgNodes = content.DocumentNode.SelectNodes("//img");
        if (imgNodes == null)
        {
            return;
        }

        foreach (HtmlNode imgTag in imgNodes)
        {
            HtmlNode paragraph = imgTag.ParentNode;
            if (paragraph == null || paragraph.Name != "p")
            {
                continue;
            }

            // Unwrap in place. Appending the markup to the grandparent's InnerHtml would move
            // the image to the end of the content and detach every node still to be visited,
            // which makes the next removal throw.
            ReplaceWithChildren(paragraph);
        }
    }

    /// <summary>
    /// Removes paragraphs whose inner HTML is empty or consists only of white space.
    /// Content editors should use line breaks instead, or spacing should be realized with CSS.
    /// </summary>
    /// <param name="content">Document to modify in place.</param>
    private void RemoveEmptyPTags(HtmlDocument content)
    {
        HtmlNodeCollection pNodes = content.DocumentNode.SelectNodes("//p");
        if (pNodes == null)
        {
            return;
        }

        foreach (HtmlNode pTag in pNodes)
        {
            // IsNullOrWhiteSpace already covers single spaces and line breaks.
            // NOTE(review): a paragraph holding only the textual entity for a non-breaking
            // space is not matched here - confirm whether that case should be removed too.
            if (pTag.ParentNode != null && string.IsNullOrWhiteSpace(pTag.InnerHtml))
            {
                pTag.ParentNode.RemoveChild(pTag);
            }
        }
    }

    /// <summary>
    /// Returns the text placed between an existing attribute value and a value appended to it.
    /// Style declarations must be separated by a semicolon; other values are separated by a space.
    /// </summary>
    /// <param name="attributeName">Name of the attribute being merged.</param>
    /// <param name="existingValue">Current, non-empty value of the attribute.</param>
    /// <returns>The separator to insert before the appended value.</returns>
    private static string GetAttributeSeparator(string attributeName, string existingValue)
    {
        bool isStyle = string.Equals(attributeName, "style", StringComparison.OrdinalIgnoreCase);
        if (isStyle && !existingValue.TrimEnd().EndsWith(";", StringComparison.Ordinal))
        {
            return "; ";
        }

        return " ";
    }

    /// <summary>
    /// Replaces a node by its children: each child is inserted into the node's parent just
    /// before the node, in the original order, and the node itself is then removed.
    /// </summary>
    /// <param name="node">Node to unwrap; nothing happens when it has no parent.</param>
    private static void ReplaceWithChildren(HtmlNode node)
    {
        HtmlNode parent = node.ParentNode;
        if (parent == null)
        {
            return;
        }

        // Iterate over a copy so the loop does not depend on the collection being re-linked.
        foreach (HtmlNode child in new List<HtmlNode>(node.ChildNodes))
        {
            parent.InsertBefore(child, node);
        }

        parent.RemoveChild(node);
    }
}
}
Have extensions you want to share to get better HTML? Let me know!
Reacties
Een reactie posten