Pairing HTML tags from an array

Issue

What would be the best way to pair up individual HTML tags from an array? I'm using a regex that extracts all of the tags from a string, in order, and puts them into an array. So if I have a string like

"
<div id='one'>
  <span>
    <h1></h1>
  </span>
</div>
<div id='two'>
  <a>
    <span></span>
  </a>
</div>
 "

Without the line breaks, of course. The tags are then matched into an array in the same order as they appear in the HTML, i.e. ["<div id='one'>", "<span>", "<h1>", "</h1>", "</span>", "</div>", "<div id='two'>"] etc. But I've been struggling to pair these up in a good way from the array, so that the result would be something like this:

{
  parentHTML = "<div id='one'></div>",
  childrenHTML = "<span><h1></h1></span>",
  children: {
    parentHTML = "<span></span>",
    childrenHTML = "<h1></h1>"
  }
}
{
  parentHTML = "<div id='two'></div>",
  childrenHTML = "<a><span></span></a>",
  children: {
    parentHTML = "<a></a>",
    childrenHTML = "<span></span>"
  }
}

I have almost come up with a solution for this, although it's just so massive and buggy, and it throws an error at the end as well, so I'm wondering whether this could be simplified in some way.

 function obj(parent) {
  // Record for one element: `parentXML` starts as the opening tag and later
  // receives the matching closing tag; `childrenXML` collects every tag that
  // appears between the two.
  this.parentXML = parent;
  this.childrenXML = '';
}

// One entry per opening (or self-closed) tag, in document order.
const final = [];
const string = "<div id='hey'><span><span></span><div></div><div id='bob'></div><div></div></span></div>";
// `match` yields null when nothing matches, so fall back to an empty list.
const str = string.match(/<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)\/?>/g) || [];

// True for tags such as `</div>`.
const isClosingTag = (tag) => /^<\//.test(tag);
// True for tags written as `<br/>`, which never have a separate closing tag.
const isSelfClosedTag = (tag) => /\/>$/.test(tag);

str.forEach((tag, index) => {
  if (isClosingTag(tag)) {
    return;
  }

  const node = new obj(tag);
  final.push(node);

  if (isSelfClosedTag(tag)) {
    return;
  }

  // Scan forward while tracking nesting depth so that the closing tag picked
  // is the one that balances this opening tag, not merely the first one with
  // the same name. The bounded loop also stops cleanly on unbalanced input.
  // NOTE: void tags written without a slash (e.g. `<br>`) count as openers here.
  let depth = 1;
  for (let cursor = index + 1; cursor < str.length; cursor += 1) {
    const current = str[cursor];

    if (isClosingTag(current)) {
      depth -= 1;
    } else if (!isSelfClosedTag(current)) {
      depth += 1;
    }

    if (depth === 0) {
      node.parentXML += current;
      break;
    }

    node.childrenXML += current;
  }
});

console.log(final);

I know this might be a lot to ask, but I would appreciate any help. 🙂

Solution

I thought this made for an interesting problem to solve, so here’s my implementation.

I separated the logic out into various helper functions to hopefully make it more readable. I’ve added support for recognising HTML elements that are self closing and cannot have children.

// Sample markup for the demo. The string begins and ends with a line break,
// which is why the list of lines starts and finishes with an empty entry.
const html = [
  '',
  '<div class="header">',
  '  <h1>This is my header</h1>',
  '</div>',
  '<div class="body">',
  '  <p>This is some text and a <a href="#">link</a>.</p>',
  '</div>',
  '<div class="footer">',
  '  <ul>',
  '    <li><a href="#">One</a></li>',
  '    <li><a href="#">Two</a></li>',
  '    <li><a href="#">Three</a></li>',
  '  </ul>',
  '</div>',
  '',
].join('\n');

/**
 * Extracts every tag-like token (`<...>`) from a markup string, in document order.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} The matched tags; an empty array when there are none.
 */
function parseHTMLElements(html) {
  // String.prototype.match returns null (not an empty array) when a global
  // regex finds nothing, which would crash callers that read `.length`.
  return html.match(/<[^>]+>/g) || [];
}

/**
 * Builds a nested tree of nodes from a flat, ordered list of tags.
 *
 * An opening element starts a new level whose children are every node read
 * up to the next closing element; a self-closing element becomes a leaf; a
 * closing element ends the current level. Tokens that match none of the
 * three predicates are skipped.
 *
 * The input array is only read, never modified.
 *
 * @param {string[]} elements - Tags in document order. `null`/`undefined`
 *   is treated as an empty list.
 * @returns {Object[]} Top-level nodes as produced by `buildNode`.
 */
function buildTree(elements) {
  const tokens = elements || [];
  // A shared read position replaces the repeated `shift()` calls, which
  // emptied the caller's array and cost O(n) per removed element.
  let position = 0;

  // Reads nodes until the current level is closed or the tokens run out.
  function readLevel() {
    const level = [];

    while (position < tokens.length) {
      const element = tokens[position];
      position += 1;

      if (isOpeningElement(element)) {
        const childTree = readLevel();
        level.push(buildNode(element, childTree));
      } else if (isSelfClosingElement(element)) {
        level.push(buildNode(element));
      } else if (isClosingElement(element)) {
        return level;
      }
    }

    return level;
  }

  return readLevel();
}

/**
 * Creates the tree node for one element.
 *
 * @param {string} element - The element's opening (or self-closing) tag.
 * @param {Object[]} [children=[]] - Already-built child nodes.
 * @returns {{parentHTML: string, childrenHTML: string, children: Object[]}}
 *   The element's own markup, the serialized markup of its children, and the
 *   child nodes themselves.
 */
function buildNode(element, children = []) {
  return {
    parentHTML: getParentHTML(element),
    childrenHTML: getChildrenHTML(children),
    children,
  };
}

/**
 * Tells whether a tag opens an element that expects a matching closing tag.
 *
 * Slashes inside attribute values (e.g. `<a href="/home">`) are allowed.
 * Comments, doctypes and processing instructions (`<!...>`, `<?...>`) are
 * rejected because the tag must start with a letter, and anything reported
 * by `isSelfClosingElement` is rejected because it never gets a closing tag.
 *
 * @param {string} element - A single tag.
 * @returns {boolean} True for container opening tags only.
 */
function isOpeningElement(element) {
  return /^<[a-zA-Z][^>]*>$/.test(element) && !isSelfClosingElement(element);
}

/**
 * Tells whether a tag is a closing tag such as `</div>`.
 *
 * @param {string} element - A single tag.
 * @returns {boolean} True when the tag starts with `</`.
 */
function isClosingElement(element) {
  return /^<\/.+>$/.test(element);
}

/**
 * Tells whether a tag stands on its own and cannot have children.
 *
 * That is the case for HTML void elements (matched case-insensitively, and
 * only when the whole tag name matches, so `<link-button>` is not mistaken
 * for `<link>`) and for any tag written in the `<name ... />` form.
 * `iframe` is deliberately absent: it is not a void element and requires a
 * closing tag.
 *
 * @param {string} element - A single tag.
 * @returns {boolean} True when no closing tag is expected for this tag.
 */
function isSelfClosingElement(element) {
  const isVoidElement =
    /^<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s/>])[^>]*>$/i.test(element);
  // NOTE: an unquoted attribute value that ends in `/` right before `>` would
  // also match here; quoted values are unaffected.
  const hasTrailingSlash = /^<[a-zA-Z][^>]*\/>$/.test(element);

  return isVoidElement || hasTrailingSlash;
}

/**
 * Derives the closing tag for a given tag, e.g. `<div id="a">` -> `</div>`.
 *
 * @param {string} element - An opening (or closing) tag.
 * @returns {string} The closing tag for the same tag name; the input is
 *   returned unchanged when it does not look like a tag.
 */
function getClosingElement(element) {
  // `[\s\S]*` instead of `.*`: `.` does not match line breaks, so a tag whose
  // attributes span several lines would otherwise fail to match and be
  // returned as-is instead of being turned into its closing tag.
  return element.replace(/<\/?([^\s\\/>]+)[\s\S]*>/, '</$1>');
}

/**
 * Places `innerHTML` between the opening tag and the remainder of
 * `elementHTML`.
 *
 * @param {string} elementHTML - Markup of one element, e.g. `<p></p>`.
 * @param {string} innerHTML - Markup to insert right after the opening tag.
 * @returns {string} The combined markup. Self-closing elements, and input
 *   that does not start with a tag, are returned unchanged.
 */
function insertHTML(elementHTML, innerHTML) {
  if (isSelfClosingElement(elementHTML)) {
    return elementHTML;
  }

  // Group 1 is the leading tag, group 2 is whatever follows it on that line.
  const parts = /^(<[^>]+>)(.*)$/.exec(elementHTML);
  if (parts === null) {
    return elementHTML;
  }

  const [, head, tail] = parts;
  return `${head}${innerHTML}${tail}`;
}

/**
 * Produces the markup of an element without any content.
 *
 * @param {string} openingElement - The element's opening tag.
 * @returns {string} The tag itself when it is self-closing, otherwise the
 *   opening tag immediately followed by its closing tag.
 */
function getParentHTML(openingElement) {
  return isSelfClosingElement(openingElement)
    ? openingElement
    : `${openingElement}${getClosingElement(openingElement)}`;
}

/**
 * Serializes a list of nodes, including all of their descendants, to markup.
 *
 * @param {Object[]} children - Nodes carrying `parentHTML` and `children`.
 * @returns {string} The concatenated markup; an empty string for an empty list.
 */
function getChildrenHTML(children) {
  let markup = '';

  for (const child of children) {
    // Serialize the descendants first, then wrap them in the child's own tags.
    const nested = getChildrenHTML(child.children);
    markup += insertHTML(child.parentHTML, nested);
  }

  return markup;
}

// Demo: tokenize the sample markup, build the tree, and print it.
const elements = parseHTMLElements(html);
const tree = buildTree(elements);
// console.log stops expanding nested objects after a couple of levels in
// Node.js, which hides the deeper `children`; console.dir with `depth: null`
// prints the complete tree.
console.dir(tree, { depth: null });

Answered By – fubar

Answer Checked By – David Marino (AngularFixing Volunteer)

Leave a Reply

Your email address will not be published.