从文件中字符的索引获取行号

Question

从文件中字符的索引获取行号

javascript regex string coffeescript

5

我有一个包含若干单词的字符串输入。我在 getWord(input) 函数中使用带 g 标志的 regex.exec 来获取所有单词。

因此，我的输入可能如下所示（两行之间有一个换行符）： word word2\nsomeword blah

从exec中获取的是包含匹配索引的对象。因此，它类似于数组： ["word", index: 0, input: "..."] ... ["someword", index: 11, input: "..."] ...

我需要一种简便的方法，根据索引（11）算出单词"someword"位于第2行（因为没有其他值能告诉我行号）。

这是我想到的：匹配'\n'直到匹配到的\n具有比word更高的索引。不确定在10k行文件中是否可能会出现问题。

实现代码:

# Returns the 1-based line number of the character at `index` in `input`.
#
# Walks the "\n" matches from the start of `input` and counts the ones
# located strictly before `index`. A newline character sitting exactly at
# `index` is still part of the line it terminates, so it is not counted.
#
# @param index [Number] zero-based character offset into `input`
# @param input [String] full text to scan
# @return [Number] line number, starting at 1
getLineFromIndex: (index, input) ->
  regex = /\n/g
  line = 1

  loop
    # exec() on a /g regex resumes right after the previous match
    match = regex.exec(input)
    # stop when the newlines run out or the next one is at/after `index`
    break if not match? or match.index >= index

    line++

  return line

这里可以进行一些比较大的优化。我可以保存正则表达式和上次匹配的结果，这样每次检查行号时就不必遍历整个输入了。只有当上次匹配的位置小于当前位置时，才需要重新执行正则表达式。

以下是最终的优化方案：

  ###
    Returns the 1-based line number of the character at `index` in @content.

    A cursor is kept on the instance between calls (@lineMatcher, @lastMatch,
    @lastLine, @eof) so that successive lookups only scan forward from the
    last newline that was found. Because the cursor never moves backwards,
    calls on the same instance must use non-decreasing `index` values.

    Newlines located strictly before `index` are counted; a newline sitting
    exactly at `index` still belongs to the line it terminates.

    @variable content [String] is input content
    @param index [Number] zero-based character offset into @content
    @return [Number] line number, starting at 1
  ###
  getLineFromIndex: (index) ->
    @lineMatcher ?= /\n/g
    @lastLine ?= 1

    # Prime the cursor with the first newline. Content without any newline
    # is a single line, so flag @eof immediately instead of leaving
    # @lastMatch null and dereferencing it below.
    if not @eof and not @lastMatch?
      @lastMatch = @lineMatcher.exec(@content)
      @eof = true unless @lastMatch?

    # Step over every newline located before `index`, not just one, so a
    # lookup that jumps several lines ahead still lands on the right line.
    while not @eof and @lastMatch.index < index
      @lastLine++
      match = @lineMatcher.exec(@content)
      if match?
        @lastMatch = match
      else
        # A /g regex rewinds lastIndex to 0 after a failed exec, so stop
        # calling it once the newlines are exhausted.
        @eof = true

    return @lastLine

- petomalina

如果先取文件内容中长度为 index 的子字符串，再按 \n 进行拆分，会不会更划算？这样立刻就能得到行号（我想可能要减 1）。 - Jongware

3个回答

0

你的伪代码看起来能够完成任务。但我不明白你如何通过搜索词的偏移量推断出行号。我会将输入文本按行分割，然后在数组中查找搜索词，如果找到就返回行索引。

// Two-line sample text: "someword" sits on the second line.
var input = ["word word2 ", "someword blah"].join("\n");


/**
 * Finds every line of `input` that contains `word`.
 *
 * @param {string} input - Text whose lines are separated by "\n".
 * @param {string} word - Substring to look for inside each line.
 * @returns {number[]} 0-based indexes of the matching lines, in order.
 */
function getLinesNumberOf(input, word) {
  const hits = [];
  const rows = input.split("\n");

  for (let row = 0; row < rows.length; row++) {
    if (rows[row].indexOf(word) !== -1) {
      hits.push(row);
    }
  }

  return hits;
}


// Demo call: logs the array returned by getLinesNumberOf for the word "someword".
console.log(getLinesNumberOf(input,"someword"));

我已经添加了对搜索词多次出现的支持。

编辑

为避免在处理大量输入时过度消耗内存，您可以按顺序逐行解析（其优点与 SAX 相对于 DOM 的优点相同）：

/**
 * Scans `input` one line at a time and collects the 0-based numbers of the
 * lines that contain `word`, without first splitting the whole text into an
 * array of lines.
 *
 * @param {string} word - Text to look for; a match must lie within one line.
 * @param {string} input - Text to scan, with lines separated by "\n".
 * @returns {number[]} 0-based line numbers in ascending order. The same
 *   array is also written to the console.
 */
function getLinesNumberOf(word, input) {
  const lineNumbers = [];
  let currentLine = 0;
  let lineStart = 0;

  // `<=` so that the (possibly empty) line after a trailing "\n" is visited.
  while (lineStart <= input.length) {
    // The final line has no terminating "\n"; it runs to the end of input.
    const newlineAt = input.indexOf("\n", lineStart);
    const lineEnd = newlineAt === -1 ? input.length : newlineAt;

    // Only the current line is searched, so a hit always belongs to it.
    if (input.substring(lineStart, lineEnd).indexOf(word) >= 0) {
      lineNumbers.push(currentLine);
    }

    // The next line starts just after the line break.
    lineStart = lineEnd + 1;
    currentLine++;
  }

  console.log(lineNumbers);
  return lineNumbers;
}

- Gaël Barbin

假设你有大量（可能很大）的文件。将它们全部拆分成数组似乎不是一个好主意。但是，对于我给出的代码片段，有一种优化方法。你可以实际上保存正则表达式，这样你就不必每次都遍历文件了。 - petomalina

0

gwer提出的原始解决方案是这样的：

/**
 * Computes the 1-based line number of the character at `index` by splitting
 * the text that precedes it on "\n" and counting the pieces.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const leading = text.slice(0, index);
  const pieces = leading.split('\n');
  return pieces.length;
}

然而，还有更快的解决方案：

/**
 * Computes the 1-based line number of the character at `index` by counting
 * the "\n" characters in the text that precedes it.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const newlines = text.slice(0, index).match(/\n/g);
  // match() with a /g regex yields null when nothing matched.
  if (!newlines) {
    return 1;
  }
  return newlines.length + 1;
}

这个是根据我的有限基准测试来看最快的，而且由于它根本不对输入文本进行任何操作，所以应该使用最少的内存。

/**
 * Computes the 1-based line number of the character at `index` by walking
 * the characters before it and counting each "\n", without creating any
 * intermediate string or array.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumberB(text, index) {
  let lineCount = 1;
  let cursor = 0;
  while (cursor < index) {
    const isNewline = text[cursor] === '\n';
    if (isNewline) {
      lineCount += 1;
    }
    cursor += 1;
  }
  return lineCount;
}

如果您想处理不同的可能的行尾，可以选择预处理文本（推荐）：

// Rewrite every "\r\n" pair and every lone "\r" as a single "\n".
text = text.replace(/\r\n|\r/g, '\n');

或者你可以使用这些更复杂的解决方案：

/**
 * Computes the 1-based line number of the character at `index`, treating
 * "\r\n", a lone "\r" and a lone "\n" each as one line break.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const breaks = text.slice(0, index).match(/\r\n|\r|\n/g);
  // match() with a /g regex yields null when nothing matched.
  if (!breaks) {
    return 1;
  }
  return breaks.length + 1;
}

/**
 * Computes the 1-based line number of the character at `index` by walking
 * the characters before it, treating "\r\n", a lone "\r" and a lone "\n"
 * each as one line break.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  let lineCount = 1;
  let cursor = 0;

  while (cursor < index) {
    const ch = text[cursor];

    if (ch === '\r') {
      // "\r\n" is one line break: step over the "\n" so it is not counted again.
      if (text[cursor + 1] === '\n') {
        cursor += 1;
      }
      lineCount += 1;
    } else if (ch === '\n') {
      lineCount += 1;
    }

    cursor += 1;
  }

  return lineCount;
}

- peterjwest

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- gwer · Accepted Answer

截取输入内容 (a.substr(0, 11))。
分割内容 (a.substr(0, 11).split('\n'))。
计算行数 (a.substr(0, 11).split('\n').length)。