Recently, I was challenged by the task of managing the overflow of posts on this site, for example in the following post: Visual Studio 2005 Class Diagram for Web Site. The problem is that there is no easy way to shorten the text and add an ellipsis to the end, because the actual HTML of a post has a complex structure. So, I decided to parse the HTML into a graph of nodes and work on this graph until it satisfies the condition. In this case, the condition was that the length of the source must not be greater than some constant N, for example, N=1000 characters. So, I wrote the following class and validated it using several test cases.
/// <summary>
/// Shrinks a well-formed (X)HTML document by removing elements from its end until
/// the serialized document is no longer than a given cutoff.
/// </summary>
/// <remarks>
/// Only elements whose names appear in the candidate tag list are ever removed, and
/// they are always removed whole (together with their content). Text is never
/// shortened in place, so the result may be considerably shorter than the cutoff.
/// Tag names are compared case-sensitively.
/// </remarks>
public class HtmlOverflow
{
    #region Instance members

    // Names of the elements that are allowed to be removed from the document.
    private readonly string[] _candidateTags = new string[] { "html", "span", "div", "p" };

    #endregion

    #region Constructors

    /// <summary>
    /// Creates an instance that may remove "html", "span", "div" and "p" elements.
    /// </summary>
    public HtmlOverflow()
    {
    }

    /// <summary>
    /// Creates an instance that may remove only the given elements.
    /// </summary>
    /// <param name="candidateTags">Names of the elements that may be removed.</param>
    /// <exception cref="ArgumentNullException"><paramref name="candidateTags"/> is null.</exception>
    public HtmlOverflow(string[] candidateTags)
    {
        // Fail here rather than with a NullReferenceException deep inside the trimming loop.
        if (candidateTags == null)
        {
            throw new ArgumentNullException("candidateTags");
        }

        _candidateTags = candidateTags;
    }

    #endregion

    #region Public members

    /// <summary>
    /// Parses <paramref name="html"/> as XML and trims it to at most
    /// <paramref name="cutoff"/> characters.
    /// </summary>
    /// <param name="html">Well-formed markup with a single root element.</param>
    /// <param name="cutoff">Maximum allowed length of the serialized document.</param>
    /// <returns>The serialized document after trimming.</returns>
    public string ProcessHtmlOverflow(string html, int cutoff)
    {
        XmlDocument doc = new XmlDocument();
        doc.LoadXml(html);

        // Start at the root element, not at the document: the document node has no
        // OwnerDocument, so starting there makes the measured length 0 and the
        // trimming loop never runs.
        ProcessHtmlOverflowCore(doc.DocumentElement, cutoff);
        return doc.InnerXml;
    }

    /// <summary>
    /// Trims <paramref name="doc"/> in place, starting from its first child node.
    /// </summary>
    /// <param name="doc">Document to trim; it is modified.</param>
    /// <param name="cutoff">Maximum allowed length of the serialized document.</param>
    /// <returns>The serialized document after trimming.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public string ProcessHtmlOverflow(XmlDocument doc, int cutoff)
    {
        if (doc == null)
        {
            throw new ArgumentNullException("doc");
        }

        ProcessHtmlOverflowCore(doc.FirstChild, cutoff);
        return doc.InnerXml;
    }

    /// <summary>
    /// Trims <paramref name="doc"/> in place, starting from its second child node,
    /// i.e. skipping a leading node such as the XML declaration.
    /// </summary>
    /// <param name="doc">Document to trim; it is modified.</param>
    /// <param name="cutoff">Maximum allowed length of the serialized document.</param>
    /// <returns>The serialized document after trimming.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public string ProcessXmlOverflow(XmlDocument doc, int cutoff)
    {
        if (doc == null)
        {
            throw new ArgumentNullException("doc");
        }

        // Skip xml header. If the document has no second child the core routine
        // receives null and leaves the document untouched.
        ProcessHtmlOverflowCore(doc.ChildNodes[1], cutoff);
        return doc.InnerXml;
    }

    /// <summary>
    /// Writes the descendants of <paramref name="node"/> to the console in preorder,
    /// one per line, as "name length" where length is the length of the node's
    /// outer XML. Each nesting level is indented by one extra space.
    /// </summary>
    /// <param name="node">Node whose descendants are printed; null prints the text &lt;null&gt;.</param>
    /// <param name="indent">Number of spaces written before each direct child.</param>
    public void PrintPreorder(XmlNode node, int indent)
    {
        // If node is null, return
        if (node == null)
        {
            Console.WriteLine("<null>");
            return;
        }

        foreach (XmlNode child in node.ChildNodes)
        {
            for (int i = 0; i < indent; i++)
            {
                Console.Write(" ");
            }

            Console.WriteLine("{0} {1}", child.Name, child.OuterXml.Length);

            // Pass indent + 1 instead of mutating indent, so that all siblings
            // stay on the same indentation level.
            PrintPreorder(child, indent + 1);
        }
    }

    #endregion

    #region Protected and private members

    /// <summary>
    /// Removes candidate elements from the end of the document that owns
    /// <paramref name="node"/> until the document is no longer than
    /// <paramref name="cutoff"/> or nothing more can be removed.
    /// </summary>
    /// <param name="node">Node to start from; null leaves everything untouched.</param>
    /// <param name="cutoff">Maximum allowed length of the serialized document.</param>
    protected void ProcessHtmlOverflowCore(XmlNode node, int cutoff)
    {
        if (node == null)
        {
            return;
        }

        // Nodes below which no removable element was found. A terminal node is
        // removed itself if it is a candidate; otherwise the search continues
        // among its own siblings and then further up.
        List<XmlNode> terminalNodes = new List<XmlNode>();

        while (GetHtmlLength(node.OwnerDocument) > cutoff)
        {
            XmlNode leaf = node;
            bool isTerminal = terminalNodes.Contains(node);

            // If the node is terminal node, it will be a candidate delete
            if (isTerminal && IsCandidate(node.Name))
            {
                XmlNode owner = leaf.ParentNode;
                if (owner == null)
                {
                    // Detached node: there is nothing it can be removed from.
                    return;
                }

                node = owner;
                node.RemoveChild(leaf);
                continue;
            }

            // Get last postorder leaf node. A terminal node that is not a candidate
            // is NOT descended into again: its subtree has already been searched,
            // and re-descending would find the same leaf and loop forever.
            if (!isTerminal)
            {
                while (leaf.LastChild != null)
                {
                    leaf = leaf.LastChild;
                }
            }

            // Get the node that can be removed
            while (leaf.PreviousSibling != null && !IsCandidate(leaf.Name))
            {
                leaf = leaf.PreviousSibling;
            }

            // There are no allowed tags at current depth, thus add the parent
            // to the list of terminal nodes and continue one level up
            if (leaf.PreviousSibling == null)
            {
                XmlNode parent = leaf.ParentNode;
                if (parent == null)
                {
                    // Reached the top of the tree without finding anything removable.
                    return;
                }

                if (!terminalNodes.Contains(parent))
                {
                    terminalNodes.Add(parent);
                }

                node = parent;
                continue;
            }

            // Check for overflow, remove last child
            node = leaf.ParentNode;
            node.RemoveChild(leaf);
        }
    }

    /// <summary>
    /// Returns the length of the serialized document, or 0 when there is no document.
    /// </summary>
    private int GetHtmlLength(XmlDocument doc)
    {
        return doc == null ? 0 : doc.OuterXml.Length;
    }

    /// <summary>
    /// Tells whether elements named <paramref name="name"/> may be removed.
    /// The comparison is case-sensitive.
    /// </summary>
    private bool IsCandidate(string name)
    {
        return Array.IndexOf(_candidateTags, name) >= 0;
    }

    #endregion
}
And here are the test cases, which will give you an idea of how to use the HtmlOverflow class:
/// <summary>
/// Console driver that exercises the three public entry points of HtmlOverflow.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the three sample cases. Case 3 reads "testCase3.htm" from the current
    /// directory and writes the trimmed document to "testCase3Output.htm".
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        HtmlOverflow cut = new HtmlOverflow();

        // Case 1: string input. The cutoff of 44 equals the length of the literal.
        string testCase1Xml = "<html><body><span><br/></span></body></html>";
        Console.WriteLine(testCase1Xml);
        string testCase1Result = cut.ProcessHtmlOverflow(testCase1Xml, 44);
        Console.WriteLine(testCase1Result);

        // Case 2: same markup as an XmlDocument; the tree is printed before and after.
        XmlDocument testCase2Doc = new XmlDocument();
        testCase2Doc.LoadXml("<html><body><span><br/></span></body></html>");
        cut.PrintPreorder(testCase2Doc.FirstChild, 0);
        string testCase2Result = cut.ProcessHtmlOverflow(testCase2Doc, 44);
        cut.PrintPreorder(testCase2Doc.FirstChild, 0);

        // Case 3: a document loaded from disk; ChildNodes[1] is the node that
        // ProcessXmlOverflow starts from (the first child is skipped).
        XmlDocument testCase3Doc = new XmlDocument();
        testCase3Doc.Load("testCase3.htm");
        cut.PrintPreorder(testCase3Doc.ChildNodes[1], 0);
        string testCase3Result = cut.ProcessXmlOverflow(testCase3Doc, 1500);
        cut.PrintPreorder(testCase3Doc.ChildNodes[1], 0);
        testCase3Doc.Save("testCase3Output.htm");
    }
}
This is it.