Unwanted HTML elements in your Sitecore richtext fields
Custom save events to the rescue!
I have been planning this blog post for years, but kept postponing it. It is finally here, prompted by similar requests from our great Sitecore community.
The problem I have solved in my projects in the past is the following:
When editing rich text fields, the default behavior of the Sitecore Telerik control is to insert a few too many HTML elements around your content: for example, <p> tags you don't want, or <p> tags or divs around an image that you embed in the rich text field.
At some point I was so annoyed by this and the bad HTML it produced that I decided to customize the item save event to clean up the resulting HTML. Below you will find my code; I hope you find it useful. Be aware that this code has only been tested on Sitecore 7.x. While I expect it to work on 8+ as well, you had better test for yourself that nothing breaks.
Also, extend it to your own needs to create your own neat and clean HTML.
// Don't forget the following config file patch for Sitecore:
//
// <configuration>
//   <sitecore>
//     <events>
//       <event name="item:saved">
//         <handler type="YourNameSpace.Sitecore.Events.RichTextEditorSaveEvent, YourNameSpace.Sitecore" method="OnItemSaving">
//           <database>master</database>
//         </handler>
//       </event>
//     </events>
//   </sitecore>
// </configuration>

using Sitecore.Data.Items;
using Sitecore.Events;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Sitecore.Collections;
using Sitecore.SecurityModel;
using Sitecore.Data.Fields;
using Sitecore.Diagnostics;
using System.Collections;
using System.Xml;
using Sitecore.Data;
using HtmlAgilityPack;

namespace YourNameSpace.Sitecore.Events
{
    /// <summary>
    /// Save event handler that fires for all items but only acts on rich text fields:
    /// it cleans up the HTML stored in those fields.
    /// </summary>
    public class RichTextEditorSaveEvent
    {
        /// <summary>Field type key that identifies a rich text field.</summary>
        private const string RichTextTypeKey = "rich text";

        /// <summary>
        /// IDs of the items currently being processed. The handler saves the item itself,
        /// which raises the save event again; this collection is the re-entrancy guard.
        /// </summary>
        private static readonly SynchronizedCollection<ID> MProcess = new SynchronizedCollection<ID>();

        /// <summary>
        /// Name of the database this handler is restricted to; set from the
        /// &lt;database&gt; element of the handler configuration.
        /// </summary>
        public string Database { get; set; }

        /// <summary>
        /// Event entry point. Cleans the HTML of every non-empty rich text field of the
        /// saved item and writes the changed values back in a single edit.
        /// </summary>
        /// <param name="sender">Event sender (unused).</param>
        /// <param name="args">Event arguments; parameter 0 is expected to be the saved <see cref="Item"/>.</param>
        public void OnItemSaving(object sender, EventArgs args)
        {
            var item = Event.ExtractParameter(args, 0) as Item;
            if (item == null)
            {
                return;
            }

            // Database names are identifiers, so compare them ordinally rather than with
            // the culture-sensitive default of String.Compare.
            bool isOtherDatabase = item.Database != null
                && !string.Equals(item.Database.Name, this.Database, StringComparison.OrdinalIgnoreCase);
            if (isOtherDatabase || MProcess.Contains(item.ID))
            {
                return;
            }

            MProcess.Add(item.ID);
            try
            {
                // Collect the changes first so the item is saved at most once, and not at
                // all when the cleanup leaves every field untouched.
                var cleanedValues = new Dictionary<ID, string>();
                foreach (Field field in item.Fields)
                {
                    if (!string.Equals(field.TypeKey, RichTextTypeKey, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    string original = field.Value;
                    if (string.IsNullOrEmpty(original))
                    {
                        continue;
                    }

                    string cleaned = CleanHtml(original.Trim());
                    if (!string.Equals(cleaned, original, StringComparison.Ordinal))
                    {
                        cleanedValues[field.ID] = cleaned;
                    }
                }

                if (cleanedValues.Count == 0)
                {
                    return;
                }

                using (new SecurityDisabler())
                {
                    item.Editing.BeginEdit();
                    try
                    {
                        foreach (KeyValuePair<ID, string> cleanedValue in cleanedValues)
                        {
                            item.Fields[cleanedValue.Key].Value = cleanedValue.Value;
                        }

                        item.Editing.EndEdit();
                    }
                    catch
                    {
                        // Do not leave the item in editing state when writing fails.
                        item.Editing.CancelEdit();
                        throw;
                    }
                }
            }
            catch (Exception ex)
            {
                // Cleanup is best effort and must never break the save itself, but the
                // failure should be visible in the log instead of being swallowed.
                Log.Error("RichTextEditorSaveEvent: could not clean rich text fields of item " + item.ID, ex, this);
            }
            finally
            {
                MProcess.Remove(item.ID);
            }
        }

        /// <summary>
        /// Runs all cleanup steps on one rich text value.
        /// </summary>
        /// <param name="content">Trimmed, non-empty field value.</param>
        /// <returns>
        /// The cleaned HTML. When processing throws, the content is returned wrapped in a
        /// paragraph unless it already contains the text "&lt;p&gt;".
        /// </returns>
        private string CleanHtml(string content)
        {
            try
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(content);

                RemoveEmptyPTags(htmlDocument);
                RemovePTagAroundImages(htmlDocument);
                MoveSpanClassAndStyleToPTagIfIsDirectAndOnlyChild(htmlDocument);

                return htmlDocument.DocumentNode.InnerHtml;
            }
            catch (Exception ex)
            {
                Log.Warn("RichTextEditorSaveEvent: rich text content could not be processed as HTML", ex, this);

                // Apparently no HTML or not valid; in this case wrap in a paragraph.
                return content.Contains("<p>") ? content : "<p>" + content + "</p>";
            }
        }

        /// <summary>
        /// Moves the attributes and the contents of a span to its parent paragraph when the
        /// span is the paragraph's direct and only child, then removes the span. This gives
        /// cleaner HTML.
        /// </summary>
        /// <param name="content">Document to clean in place.</param>
        private void MoveSpanClassAndStyleToPTagIfIsDirectAndOnlyChild(HtmlDocument content)
        {
            HtmlNodeCollection spanNodes = content.DocumentNode.SelectNodes("//span");
            if (spanNodes == null || spanNodes.Count == 0)
            {
                return;
            }

            foreach (HtmlNode spanTag in spanNodes)
            {
                HtmlNode parent = spanTag.ParentNode;
                if (parent == null || parent.Name != "p" || parent.ChildNodes.Count != 1)
                {
                    continue;
                }

                foreach (HtmlAttribute attr in spanTag.Attributes)
                {
                    HtmlAttribute existing = parent.Attributes[attr.Name];
                    if (existing == null)
                    {
                        parent.Attributes.Add(attr.Name, attr.Value);
                    }
                    else
                    {
                        existing.Value = MergeAttributeValues(attr.Name, existing.Value, attr.Value);
                    }
                }

                // The attributes are merged before the tree is modified, as before.
                // Moving the child nodes (instead of re-parsing the parent's InnerHtml)
                // keeps the other nodes returned by SelectNodes attached to the document.
                ReplaceWithChildren(spanTag);
            }
        }

        /// <summary>
        /// Combines the value an attribute already has on the paragraph with the value
        /// coming from the span.
        /// </summary>
        /// <param name="name">Attribute name.</param>
        /// <param name="existingValue">Value currently on the paragraph.</param>
        /// <param name="addedValue">Value taken from the span.</param>
        /// <returns>
        /// The added value when the existing one is empty; for "style" both declaration
        /// lists separated by a semicolon; otherwise both values separated by a space.
        /// </returns>
        private static string MergeAttributeValues(string name, string existingValue, string addedValue)
        {
            if (string.IsNullOrEmpty(existingValue))
            {
                return addedValue;
            }

            if (string.Equals(name, "style", StringComparison.OrdinalIgnoreCase))
            {
                // A space between two declaration lists is invalid CSS unless the first
                // list is terminated with a semicolon.
                string declarations = existingValue.TrimEnd();
                if (declarations.Length > 0 && !declarations.EndsWith(";", StringComparison.Ordinal))
                {
                    declarations += ";";
                }

                return declarations + " " + addedValue;
            }

            return existingValue + " " + addedValue;
        }

        /// <summary>
        /// Images can get paragraph tags around them when the content editor inputs a line
        /// break after inserting the image. This is undesirable. This method removes such a
        /// paragraph and puts its contents in the paragraph's place, preserving the order
        /// of the surrounding content.
        /// </summary>
        /// <param name="content">Document to clean in place.</param>
        private void RemovePTagAroundImages(HtmlDocument content)
        {
            HtmlNodeCollection imgNodes = content.DocumentNode.SelectNodes("//img");
            if (imgNodes == null || imgNodes.Count == 0)
            {
                return;
            }

            foreach (HtmlNode imgTag in imgNodes)
            {
                HtmlNode wrapper = imgTag.ParentNode;
                if (wrapper == null || wrapper.Name != "p" || wrapper.ParentNode == null)
                {
                    continue;
                }

                ReplaceWithChildren(wrapper);
            }
        }

        /// <summary>
        /// Removes paragraphs that are empty or contain only whitespace, including
        /// whitespace written as an HTML entity such as a non-breaking space. Content
        /// editors should use line breaks instead, or spacing should be realized with CSS.
        /// </summary>
        /// <param name="content">Document to clean in place.</param>
        private void RemoveEmptyPTags(HtmlDocument content)
        {
            HtmlNodeCollection pNodes = content.DocumentNode.SelectNodes("//p");
            if (pNodes == null || pNodes.Count == 0)
            {
                return;
            }

            foreach (HtmlNode pTag in pNodes)
            {
                if (pTag.ParentNode == null)
                {
                    continue;
                }

                // Decode entities first so that a paragraph holding only a non-breaking
                // space entity is recognised as empty as well.
                if (string.IsNullOrWhiteSpace(HtmlEntity.DeEntitize(pTag.InnerHtml)))
                {
                    pTag.ParentNode.RemoveChild(pTag);
                }
            }
        }

        /// <summary>
        /// Removes <paramref name="node"/> from its parent and inserts the node's children
        /// at the position the node had.
        /// </summary>
        /// <param name="node">Node to unwrap; nothing happens when it has no parent.</param>
        private static void ReplaceWithChildren(HtmlNode node)
        {
            HtmlNode parent = node.ParentNode;
            if (parent == null)
            {
                return;
            }

            // Iterate over a snapshot because the children are re-parented while looping.
            foreach (HtmlNode child in node.ChildNodes.ToList())
            {
                parent.InsertBefore(child, node);
            }

            parent.RemoveChild(node);
        }
    }
}
Have extensions you want to share to get better HTML? Let me know!
Reacties
Een reactie posten