下面是一个示例函数,它按标点符号(例如“.”、“,”、“;”、“?”)对字符串分块,并把每个块的长度限制在最小和最大字符数之间;换句话说,它优先考虑标点符号,而不是字符长度:
import numpy as np
def chunkingStringFunction(strings, charactersDefiningChunking=(".", ",", ";", "?"), numberOfMaximumCharactersPerChunk=None, numberOfMinimumCharactersPerChunk=None, **kwargs):
    """Split strings into chunks that end at punctuation, within length bounds.

    Punctuation takes priority over length: each chunk ends right after the
    earliest delimiter that yields a chunk of at least the minimum length and
    at most the maximum length. When no delimiter qualifies, the chunk is cut
    at the maximum length, or at the end of the string if that comes first,
    so no text is lost.

    Args:
        strings: An iterable of strings to chunk. A single ``str`` is treated
            as a one-element list.
        charactersDefiningChunking: Delimiters that may end a chunk. Empty
            delimiters are ignored; multi-character delimiters are kept whole
            at the end of the chunk.
        numberOfMaximumCharactersPerChunk: Upper bound on chunk length.
            Defaults to 100 when ``None``.
        numberOfMinimumCharactersPerChunk: Lower bound on the length of a
            chunk that ends at a delimiter. Defaults to 2 when ``None``;
            values below 1 are treated as 1. The final remainder of a string
            may be shorter.
        **kwargs: Accepted and ignored.

    Returns:
        A flat list of chunks, in order, across all input strings.
        Whitespace-only chunks are omitted.

    Raises:
        ValueError: If the maximum chunk length is smaller than 1.
    """
    maximumLength = 100 if numberOfMaximumCharactersPerChunk is None else numberOfMaximumCharactersPerChunk
    minimumLength = 2 if numberOfMinimumCharactersPerChunk is None else numberOfMinimumCharactersPerChunk
    if maximumLength < 1:
        # A non-positive maximum could never advance through the string.
        raise ValueError("numberOfMaximumCharactersPerChunk must be at least 1")
    minimumLength = max(1, minimumLength)

    if isinstance(strings, str):
        # Iterating a bare str would chunk it one character at a time.
        strings = [strings]

    # An empty delimiter matches everywhere and would produce empty chunks.
    delimiters = [delimiter for delimiter in charactersDefiningChunking if delimiter]

    storingChunksOfString = []
    for string in strings:
        chunkStart = 0
        stringLength = len(string)
        while chunkStart < stringLength:
            windowEnd = min(chunkStart + maximumLength, stringLength)
            # Fallback: hard cut at the maximum length (or the end of the string).
            cutIndex = windowEnd
            for delimiter in delimiters:
                # Start searching late enough that the chunk, delimiter
                # included, reaches the minimum length.
                searchFrom = chunkStart + max(0, minimumLength - len(delimiter))
                position = string.find(delimiter, searchFrom, windowEnd)
                if position != -1:
                    cutIndex = min(cutIndex, position + len(delimiter))
            chunk = string[chunkStart:cutIndex]
            if chunk.strip():
                storingChunksOfString.append(chunk)
            chunkStart = cutIndex
    return storingChunksOfString
另外,如果希望优先考虑字符长度,可以先设定每个块的平均字符数,然后在该长度附近查找最近的分块标点,并在该标点处分块:
import numpy as np
def chunkingStringFunction(strings, charactersDefiningChunking=(".", ",", ";", "?"), averageNumberOfCharactersPerChunk=None, **kwargs):
    """Split strings into chunks of roughly average length, ending at punctuation.

    Length takes priority over punctuation: starting from the end of the
    previous chunk, the target index is ``start + average``. The chunk ends
    right after the delimiter whose last character is closest to that target
    index; on a tie the earlier delimiter wins. If no delimiter remains, or
    fewer than ``average + 1`` characters are left, the rest of the string
    becomes the final chunk.

    Args:
        strings: An iterable of strings to chunk. A single ``str`` is treated
            as a one-element list.
        charactersDefiningChunking: Delimiters that may end a chunk. Empty
            delimiters are ignored.
        averageNumberOfCharactersPerChunk: Target distance, in characters,
            from the chunk start to the delimiter. Defaults to 10 when
            ``None``.
        **kwargs: Accepted and ignored.

    Returns:
        A flat list of chunks, in order, across all input strings.
        Whitespace-only chunks are omitted.

    Raises:
        ValueError: If the average chunk length is smaller than 1.
    """
    averageLength = 10 if averageNumberOfCharactersPerChunk is None else averageNumberOfCharactersPerChunk
    if averageLength < 1:
        raise ValueError("averageNumberOfCharactersPerChunk must be at least 1")

    if isinstance(strings, str):
        # Iterating a bare str would chunk it one character at a time.
        strings = [strings]

    # An empty delimiter matches everywhere and would produce empty chunks.
    delimiters = [delimiter for delimiter in charactersDefiningChunking if delimiter]

    storingChunksOfString = []
    for string in strings:
        chunkStart = 0
        stringLength = len(string)
        while chunkStart < stringLength:
            targetIndex = chunkStart + averageLength
            # Each candidate is (distance from target, cut index), so min()
            # prefers the nearest delimiter and, on a tie, the earlier cut.
            candidates = []
            if targetIndex < stringLength:
                for delimiter in delimiters:
                    delimiterLength = len(delimiter)
                    # Nearest occurrence ending at or before the target ...
                    before = string.rfind(delimiter, chunkStart, targetIndex + 1)
                    # ... and nearest occurrence ending after the target.
                    after = string.find(delimiter, max(chunkStart, targetIndex - delimiterLength + 2))
                    for position in (before, after):
                        if position != -1:
                            cutIndex = position + delimiterLength
                            candidates.append((abs(cutIndex - 1 - targetIndex), cutIndex))
            # Without any usable delimiter the remainder is emitted as one
            # chunk, which also guarantees the loop terminates.
            cutIndex = min(candidates)[1] if candidates else stringLength
            chunk = string[chunkStart:cutIndex]
            if chunk.strip():
                storingChunksOfString.append(chunk)
            chunkStart = cutIndex
    return storingChunksOfString