通过JavaScript检测文档中的希伯来语单词

3

我在Web开发方面基本上是新手(尽管对编程有一定了解),因此请原谅任何不正确的术语。

我想构建一个脚本,当添加到HTML页面时,可以检测页面中的每个希伯来语单词,并将该单词转换为HTML元素,例如超链接和标题。

因此,以下内容:

<p>ראש הלשכה</p>

被转换为:

<p><a title="word 1" href="#">הלשכה</a> <a title="word 2" href="#">ראש</a></p>

明白吗?

因此,我想第一件事情是检测页面中的希伯来语单词。我该怎么做呢?除了查看jQuery文档之外,我不知道从哪里开始。


HTML会是Unicode吗?UTF-8,还是可以使用任何编码? - John Knoeller
好问题。让我们简单地说UTF-8。我希望它可以在像http://www.haaretz.co.il这样的文档上工作。 - Judah Gabriel Himango
1个回答

5

在字符串中查找希伯来语单词相当简单。使用一个匹配连续希伯来文码位(code point)的正则表达式即可:

/[\u05D0-\u05FF]+/

由于JS支持函数式编程,我们可以轻松地编写自己的函数来遍历文档树,在每个文本节点上调用一个函数。首先,需要一些脚手架。

// Minimal assertion scaffolding. Installed only when the page has not
// already provided its own window.assert.
if (!window.assert) {
    // Any truthy value enables assertions.
    window.dbgLvl = 1; // change this to 0 for production release

    /* assert: throws when a condition does not hold and assertions are on.

       Arguments:
         succeeded (any): the condition being asserted; falsy means failure.
         msg (string, optional): value to throw; defaults to
             'assertion failed'.
    */
    window.assert = function (succeeded, msg) {
        if (!dbgLvl || succeeded) {
            return;
        }
        throw msg ? msg : 'assertion failed';
    };
}

接下来,我们定义一个方法,将字符串拆分为数组,并在输出中保留分隔符。

/* String.separate is like String.split, but the result includes the
   separators: the returned array alternates between the text found
   between matches (even indices) and the matched separators (odd indices).

   Arguments:
     separator (RegExp || string): what to split on. A RegExp should be of
         the form /(...)/g, i.e. a single capturing group around the whole
         pattern plus the global flag. A string is compiled to a global
         RegExp (and, where split honours groups, wrapped in a group if it
         is not already parenthesised).

   Returns an array of strings.

   These implementations of 'String.separate' will work for our purposes,
   but are buggy in general, due to differences in the implementation of
   String.split.

   The two misbehaviors we correct are including neither grouped patterns
   nor empty strings in the result, though the latter is only corrected
   when the missing empty string is at the start or the end.
*/

/* _interleave: private helper that merges the text between separators
   ('posts') with the separators themselves ('fence') into a single array:
   post, fence, post, fence, ..., post.
*/
function _interleave(posts, fence) {
    var result = [], i;
    for (i=0; i<fence.length; ++i) {
        result.push(posts[i]);
        result.push(fence[i]);
    }
    result.push(posts[i]);
    return result;
}

if ('-'.split(/(-)/).length & 1) {
    assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
    // split includes groups in result, so it already does all the work.
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            if (separator.charAt(0) != '('
                || separator.charAt(separator.length-1) != ')')
            {
                // wrap in a group so that split keeps the separators
                separator = new RegExp('(' + separator + ')', 'g');
            } else {
                separator = new RegExp(separator, 'g');
            }
        }
        return this.split(separator);
    };
} else if ('a'.split(/a/).length) {
    // empty strings included, grouped aren't
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            separator = new RegExp(separator, 'g');
        }
        var fence = this.match(separator);
        if (!fence) {
            // String(this): hand back a primitive, like split does.
            return [String(this)];
        }
        var posts = this.split(separator);
        // Compare, don't assign: writing to posts.length would silently
        // truncate or pad the array and the assertion could never fail.
        assert(posts.length == fence.length+1,
               'expected exactly one more post than separators');
        return _interleave(posts, fence);
    };
} else {
    // neither empty strings nor groups are included. IE, you suck.
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            separator = new RegExp(separator, 'g');
        }
        var fence = this.match(separator);
        if (!fence) {
            // String(this): hand back a primitive, like split does.
            return [String(this)];
        }
        var posts = this.split(separator);
        if (posts.length <= fence.length) {
            /* missing some posts. Assume that they are the first or
               last, though this won't be true in general.
            */
            if (posts.length < fence.length) {
                posts.unshift('');
                posts.push('');
            } else {
                if (this.substring(0, fence[0].length) == fence[0]) {
                    posts.unshift('');
                } else {
                    posts.push('');
                }
            }
        }
        return _interleave(posts, fence);
    };
}

接下来是一些节点谓词。
// Make sure Node.TEXT_NODE can be read: create a stand-in Node object when
// the browser exposes none, or add just the missing constant otherwise.
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node = {TEXT_NODE: 3};
}

/* Predicate: does node.nodeType equal Node.TEXT_NODE (loose comparison)? */
function isTextNode(node) {
    var kind = node.nodeType;
    return kind == Node.TEXT_NODE;
}
/* Predicate: truthy when the node has at least one child. Returns the
   falsy childNodes value itself when there is no child list, otherwise
   the number of children. */
function hasKids(node) {
    var kids = node.childNodes;
    if (!kids) {
        return kids;
    }
    return kids.length;
}
/* Predicate that accepts every node; the argument is ignored. */
function allNodes(node) {
    return true;
}

现在介绍一些遍历DOM树的函数。
/*
  forEachChild: pre-order traversal of document tree. Applies a function to some nodes, determined by the 'which' and 'descendInto' arguments.

Arguments:
  which  (function): Returns true if 'action' should be applied to a node.
  action (function): Takes a node and does something to it. It may remove
      or replace the node it is given; traversal then continues with the
      node that followed it before 'action' ran, so nodes that 'action'
      inserts in its place are not visited.
  parent (Node): The node to start from.
  descendInto (function, optional): By default, forEachChild will descend into every child that itself has children. Place additional restrictions by passing this argument.
*/
var forEachChild = (function() {
        /* the actual implementation is made a local function so that the
           optional parameter can be handled efficiently.
         */
        function _forEachChild(which, action, node, descendInto) {
            var child = node.firstChild, next;
            while (child) {
                /* Grab the successor before running 'action': once a node
                   has been replaced it is detached and its nextSibling is
                   null, which would end the walk prematurely and skip all
                   remaining siblings.
                 */
                next = child.nextSibling;
                if (which(child)) {
                    action(child);
                }
                if (hasKids(child) && descendInto(child)) {
                    _forEachChild(which, action, child, descendInto);
                }
                child = next;
            }
        }
        return function (which, action, node, descendInto) {
            if (!descendInto) {descendInto=allNodes}
            _forEachChild(which, action, node, descendInto);
        }
    })();

/* forEachNode: runs forEachChild over the whole document, starting at the
   'document' node. Arguments are as for forEachChild, minus the start node. */
function forEachNode(which, action, descendInto) {
    var root = document;
    return forEachChild(which, action, root, descendInto);
}

/* forEachTextNode: applies 'action' to the nodes of the document selected
   by the isTextNode predicate, by delegating to forEachNode. */
function forEachTextNode(action, descendInto) {
    var onlyText = isTextNode;
    return forEachNode(onlyText, action, descendInto);
}

/* forEachTextNodeInBody: like forEachTextNode, but the traversal starts at
   document.body instead of at the document node. */
function forEachTextNodeInBody(action, descendInto) {
    var root = document.body;
    return forEachChild(isTextNode, action, root, descendInto);
}

最后一组函数可以将匹配模式的文本节点替换为您选择的新节点。这个组(也就是wrapText返回的函数)尚未进行完整的跨浏览器兼容性测试,包括是否正确处理文本方向。

/*
   wrapText replaces substrings in a text node with new nodes.

 Arguments:
   pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
   replace (function): Takes a string and returns a Node to replace the string.

Returns a function that takes a text node. That function splits the node's
text with String.separate, wraps every matching piece with 'replace', keeps
the non-empty text in between as plain text nodes, and swaps the result in
for the original node. A node whose text contains no match is left alone.
*/
function wrapText(pattern, replace) {
    return function (node) {
        var chunks = node.nodeValue.separate(pattern);
        if (chunks.length < 2)
            return;
        /* chunks alternates text, match, text, ..., text: it starts and
         * ends with a (possibly empty) non-matching piece, so its length
         * must be odd. Check this before building anything.
         */
        assert(chunks.length % 2 == 1, 'even number of chunks in ['+chunks+'] when it should be odd.');
        var fragment = document.createDocumentFragment();
        var i;
        // don't bother adding first chunk if it's empty.
        if (chunks[0].length) {
            fragment.appendChild(document.createTextNode(chunks[0]));
        }
        for (i=1; i < chunks.length; i+=2) {
            // odd indices hold the matches
            fragment.appendChild(replace(chunks[i]));
            // even indices hold the text after a match; empty (or missing)
            // pieces are skipped rather than added and removed again.
            if (chunks[i+1]) {
                fragment.appendChild(document.createTextNode(chunks[i+1]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    }
}

/*
  createAnchorWrap builds a function that turns a string into an anchor
  ('a') element containing that string as text, with its title set.

Arguments:
  title (string || function, optional): Source of the anchor's title.
      A function is called with the string being wrapped and its result
      becomes the title. A string is handed to createWordCounter, and the
      resulting counter supplies the titles. When omitted (or otherwise
      falsy), a createWordCounter() made without arguments is used.

Returns a function that takes a string and returns an anchor element.
 */
function createAnchorWrap(title) {
    var makeTitle = title;
    if (typeof title == 'string') {
        makeTitle = createWordCounter(title);
    } else if (!title) {
        makeTitle = createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = makeTitle(word);
        var label = document.createTextNode(word);
        anchor.appendChild(label);
        return anchor;
    };
}

/*
  createWordCounter creates a word counter, which returns the number of
  times it's been called (including the current call), prefixed by a string.
  Each counter created keeps its own independent count, starting at 1.

Arguments:
  pre (string, optional): prefix for return value. Trailing spaces are
      normalised to exactly one space between prefix and number. Defaults
      to 'word '.

Returns a function that takes a string (ignored) and returns a string.

 */
function createWordCounter(pre) {
    var wordCount=0;
    var prefix;
    if (pre) {
        prefix = pre.replace(/ *$/, ' ');
    } else {
        prefix = 'word ';
    }
    return function(text) {
        // count this call before reporting, so the first title is "... 1"
        wordCount += 1;
        return prefix + wordCount;
    }
}

最后一件要做的事情是在(例如)加载处理程序或页面底部的脚本中启动该过程。
// Kick-off: wrap every run of characters in U+05D0..U+05FF found in the
// text nodes under document.body with an anchor made by createAnchorWrap().
(function () {
    var hebrewWord = /([\u05D0-\u05FF]+)/g;
    var toAnchor = createAnchorWrap();
    forEachTextNodeInBody(wrapText(hebrewWord, toAnchor));
})();

如果您想更改标题的前缀,请将createWordCounter(...)的结果传递给createAnchorWrap

好的,这是一个开始。所以,Javascript内置支持正则表达式。好的,很棒。现在,关于在HTML文档中查找文本的那部分... - Judah Gabriel Himango
好的,那么你现在已经编写了一些JavaScript函数来遍历树。看起来我可以使用forEachTextNode(action)来将文本元素替换为锚元素。好的,我会尝试看看能做什么。到目前为止,感谢你的帮助。 - Judah Gabriel Himango
请注意,使用JS库(jQuery、Prototype、MooTools等)仍然是一个好主意。 - outis
嗯,我尝试了以下代码:"הלשכה".match(new RegExp("/[\u05D0-\u05FF]+/")),但是返回了false,没有匹配。我做错了什么吗? - Judah Gabriel Himango
JS支持RE字面量。尝试使用"ראש הלשכה".match(/[\u05D0-\u05FF]+/g)。在需要使用RegExp构造函数的情况下(基本上是在需要插入变量时),不要添加分隔符:new RegExp('[' + start + '-' + end + ']') - outis

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接