我最近遇到了一个有趣的关于字符串的问题。假设你有以下内容:
Input string1: "this is a test string"
Input string2: "tist"
Output string: "t stri"
那么,根据上述情况,我该如何找到包含字符串2中所有字符的字符串1的最小子串呢?
如果想要查看更多细节和完整的代码,请参考我的博客文章:
http://www.leetcode.com/2010/11/finding-minimum-window-in-s-which.html
为了帮助说明这个方法,我使用一个例子:string1 = "acbbaca"
,string2 = "aba"
。在这里我们还使用了术语“窗口”,这意味着来自string1的一连串字符块(可以与子字符串交换)。
i) string1 = "acbbaca",string2 = "aba"。
ii) 找到了第一个最小的窗口。请注意,由于hasFound ['a'] == needToFind ['a'] == 2,因此我们无法推进begin指针。推进将意味着破坏约束条件。
iii) 找到了第二个窗口。begin指针仍然指向第一个元素'a'。hasFound ['a'] (3)大于needToFind ['a'] (2)。我们将hasFound ['a']减1并将begin指针向右移动。
iv) 我们跳过字符'c',因为它在string2中没有找到。现在begin指针指向'b'。hasFound ['b'] (2)大于needToFind ['b'] (1)。我们将hasFound ['b']减1并将begin指针向右移动。
v) 现在开始指针指向下一个 'b'。hasFound['b'] (1) 等于 needToFind['b'] (1)。我们立即停止,这是我们新发现的最小窗口。
这个想法主要基于两个指针(窗口的起始和结束位置)和两个表(needToFind 和 hasFound)来遍历 string1。needToFind 存储 string2 中一个字符的总数,而 hasFound 存储迄今为止遇到的一个字符的总数。我们还使用 count 变量来存储迄今为止遇到的 string2 的字符总数(不计算 hasFound[x] 超过 needToFind[x] 的字符)。当 count 等于 string2 的长度时,我们知道找到了一个有效的窗口。
每次我们推进 end 指针(指向元素 x),我们将 hasFound[x] 增加一。如果 hasFound[x] 小于或等于 needToFind[x],我们也将 count 增加一。为什么?当满足约束条件时(即,count 等于 string2 的大小),我们立即将 begin 指针尽可能向右推进,同时保持约束条件。
如何检查它是否保持了约束条件?假设 begin 指向元素 x,我们检查 hasFound[x] 是否大于 needToFind[x]。如果是,我们可以将 hasFound[x] 减一并推进 begin 指针而不破坏约束条件。另一方面,如果不是,我们立即停止,因为推进 begin 指针会破坏窗口约束条件。
最后,我们检查最小窗口长度是否小于当前最小值。如果发现新的最小值,则更新当前的最小值。
本质上,该算法找到满足约束条件的第一个窗口,然后继续保持约束条件。
needToFind
是从模式字符串string2
计算出来的直方图,它在一开始就被计算出来并且永远不会改变。在这个例子中,needToFind = {'a' : 2, 'b': 1}
。另一方面,hasFound
是当前滑动窗口中字符的直方图。 - ArunO(N+M)
时间和O(1)
空间内对直方图进行扫描,其中N
是第一个字符串中字符的数量,M
是第二个字符串中字符的数量。hist2[s2 [i]] ++
)。a[i]>0 && b[i]>0
和a[i]>=b[i]
之间的差异)。O(|set(M)|)
,其中|set(M)|
是M
中唯一字符的数量。 - Rex Kerrfrom collections import defaultdict
def smallest(s1, s2):
assert s2 != ''
d = defaultdict(int)
nneg = [0] # number of negative entries in d
def incr(c):
d[c] += 1
if d[c] == 0:
nneg[0] -= 1
def decr(c):
if d[c] == 0:
nneg[0] += 1
d[c] -= 1
for c in s2:
decr(c)
minlen = len(s1) + 1
j = 0
for i in xrange(len(s1)):
while nneg[0] > 0:
if j >= len(s1):
return minlen
incr(s1[j])
j += 1
minlen = min(minlen, j - i)
decr(s1[i])
return minlen
i
标记子字符串的尾部,j
标记头部。@algorithmist:干得好,你想出了比我描述更快一点的代码! - Rex Kerr我收到了同样的面试问题。我是一名C++候选人,但我能够比较快地用JAVA编码。
Java [翻译:Sumod Mathilakath]
import java.io.*;
import java.util.*;
class UserMainCode
{
public String GetSubString(String input1,String input2){
// Write code here...
return find(input1, input2);
}
private static boolean containsPatternChar(int[] sCount, int[] pCount) {
for(int i=0;i<256;i++) {
if(pCount[i]>sCount[i])
return false;
}
return true;
}
public static String find(String s, String p) {
if (p.length() > s.length())
return null;
int[] pCount = new int[256];
int[] sCount = new int[256];
// Time: O(p.lenght)
for(int i=0;i<p.length();i++) {
pCount[(int)(p.charAt(i))]++;
sCount[(int)(s.charAt(i))]++;
}
int i = 0, j = p.length(), min = Integer.MAX_VALUE;
String res = null;
// Time: O(s.lenght)
while (j < s.length()) {
if (containsPatternChar(sCount, pCount)) {
if ((j - i) < min) {
min = j - i;
res = s.substring(i, j);
// This is the smallest possible substring.
if(min==p.length())
break;
// Reduce the window size.
sCount[(int)(s.charAt(i))]--;
i++;
}
} else {
sCount[(int)(s.charAt(j))]++;
// Increase the window size.
j++;
}
}
System.out.println(res);
return res;
}
}
C++ 【转载:sundeepblue】
#include <iostream>
#include <vector>
#include <string>
#include <climits>
using namespace std;
string find_minimum_window(string s, string t) {
if(s.empty() || t.empty()) return;
int ns = s.size(), nt = t.size();
vector<int> total(256, 0);
vector<int> sofar(256, 0);
for(int i=0; i<nt; i++)
total[t[i]]++;
int L = 0, R;
int minL = 0; //gist2
int count = 0;
int min_win_len = INT_MAX;
for(R=0; R<ns; R++) { // gist0, a big for loop
if(total[s[R]] == 0) continue;
else sofar[s[R]]++;
if(sofar[s[R]] <= total[s[R]]) // gist1, <= not <
count++;
if(count == nt) { // POS1
while(true) {
char c = s[L];
if(total[c] == 0) { L++; }
else if(sofar[c] > total[c]) {
sofar[c]--;
L++;
}
else break;
}
if(R - L + 1 < min_win_len) { // this judge should be inside POS1
min_win_len = R - L + 1;
minL = L;
}
}
}
string res;
if(count == nt) // gist3, cannot forget this.
res = s.substr(minL, min_win_len); // gist4, start from "minL" not "L"
return res;
}
int main() {
string s = "abdccdedca";
cout << find_minimum_window(s, "acd");
}
Erlang [Courtesy : wardbekker]
-module(leetcode).
-export([min_window/0]).
%% Given a string S and a string T, find the minimum window in S which will contain all the characters in T in complexity O(n).
%% For example,
%% S = "ADOBECODEBANC"
%% T = "ABC"
%% Minimum window is "BANC".
%% Note:
%% If there is no such window in S that covers all characters in T, return the emtpy string "".
%% If there are multiple such windows, you are guaranteed that there will always be only one unique minimum window in S.
min_window() ->
"eca" = min_window("cabeca", "cae"),
"eca" = min_window("cfabeca", "cae"),
"aec" = min_window("cabefgecdaecf", "cae"),
"cwae" = min_window("cabwefgewcwaefcf", "cae"),
"BANC" = min_window("ADOBECODEBANC", "ABC"),
ok.
min_window(T, S) ->
min_window(T, S, []).
min_window([], _T, MinWindow) ->
MinWindow;
min_window([H | Rest], T, MinWindow) ->
NewMinWindow = case lists:member(H, T) of
true ->
MinWindowFound = fullfill_window(Rest, lists:delete(H, T), [H]),
case length(MinWindow) == 0 orelse (length(MinWindow) > length(MinWindowFound)
andalso length(MinWindowFound) > 0) of
true ->
MinWindowFound;
false ->
MinWindow
end;
false ->
MinWindow
end,
min_window(Rest, T, NewMinWindow).
fullfill_window(_, [], Acc) ->
%% window completed
Acc;
fullfill_window([], _T, _Acc) ->
%% no window found
"";
fullfill_window([H | Rest], T, Acc) ->
%% completing window
case lists:member(H, T) of
true ->
fullfill_window(Rest, lists:delete(H, T), Acc ++ [H]);
false ->
fullfill_window(Rest, T, Acc ++ [H])
end.
REF:
//-----------------------------------------------------------------------
bool IsInSet(char ch, char* cSet)
{
char* cSetptr = cSet;
int index = 0;
while (*(cSet+ index) != '\0')
{
if(ch == *(cSet+ index))
{
return true;
}
++index;
}
return false;
}
void removeChar(char ch, char* cSet)
{
bool bShift = false;
int index = 0;
while (*(cSet + index) != '\0')
{
if( (ch == *(cSet + index)) || bShift)
{
*(cSet + index) = *(cSet + index + 1);
bShift = true;
}
++index;
}
}
typedef struct subStr
{
short iStart;
short iEnd;
short szStr;
}ss;
char* subStringSmallest(char* testStr, char* cSet)
{
char* subString = NULL;
int iSzSet = strlen(cSet) + 1;
int iSzString = strlen(testStr)+ 1;
char* cSetBackUp = new char[iSzSet];
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
int iStartIndx = -1;
int iEndIndx = -1;
int iIndexStartNext = -1;
std::vector<ss> subStrVec;
int index = 0;
while( *(testStr+index) != '\0' )
{
if (IsInSet(*(testStr+index), cSetBackUp))
{
removeChar(*(testStr+index), cSetBackUp);
if(iStartIndx < 0)
{
iStartIndx = index;
}
else if( iIndexStartNext < 0)
iIndexStartNext = index;
else
;
if (strlen(cSetBackUp) == 0 )
{
iEndIndx = index;
if( iIndexStartNext == -1)
break;
else
{
index = iIndexStartNext;
ss stemp = {iStartIndx, iEndIndx, (iEndIndx-iStartIndx + 1)};
subStrVec.push_back(stemp);
iStartIndx = iEndIndx = iIndexStartNext = -1;
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
continue;
}
}
}
else
{
if (IsInSet(*(testStr+index), cSet))
{
if(iIndexStartNext < 0)
iIndexStartNext = index;
}
}
++index;
}
int indexSmallest = 0;
for(int indexVec = 0; indexVec < subStrVec.size(); ++indexVec)
{
if(subStrVec[indexSmallest].szStr > subStrVec[indexVec].szStr)
indexSmallest = indexVec;
}
subString = new char[(subStrVec[indexSmallest].szStr) + 1];
memcpy((void*)subString, (void*)(testStr+ subStrVec[indexSmallest].iStart), subStrVec[indexSmallest].szStr);
memset((void*)(subString + subStrVec[indexSmallest].szStr), 0, 1);
delete[] cSetBackUp;
return subString;
}
//--------------------------------------------------------------------
我已经使用Python3以O(N)的效率实现了它:
def get(s, alphabet="abc"):
seen = {}
for c in alphabet:
seen[c] = 0
seen[s[0]] = 1
start = 0
end = 0
shortest_s = 0
shortest_e = 99999
while end + 1 < len(s):
while seen[s[start]] > 1:
seen[s[start]] -= 1
start += 1
# Constant time check:
if sum(seen.values()) == len(alphabet) and all(v == 1 for v in seen.values()) and \
shortest_e - shortest_s > end - start:
shortest_s = start
shortest_e = end
end += 1
seen[s[end]] += 1
return s[shortest_s: shortest_e + 1]
print(get("abbcac")) # Expected to return "bca"
这是一种使用质数来避免一个循环,并用乘法替换它的方法。还可以进行几个其他次要的优化。
为您想要查找的任何字符分配唯一的质数,对于不感兴趣的字符分配1
。
通过将质数与它应该具有的出现次数相乘来查找匹配字符串的乘积。现在,只有使用相同的质因数才能找到此乘积。
从开头搜索字符串,随着移动到运行产品中,乘以相应的质数。
如果数字大于正确的总和,请删除第一个字符并将其质数除以您的运行产品。
如果数字小于正确的总和,请包括下一个字符并将其乘入您的运行产品。
如果数字与正确的总和相同,则找到了匹配项,将开始和结束滑动到下一个字符并继续搜索其他匹配项。
决定哪个匹配项最短。
charcount = { 'a': 3, 'b' : 1 };
str = "kjhdfsbabasdadaaaaasdkaaajbajerhhayeom"
def find (c, s):
Ns = len (s)
C = list (c.keys ())
D = list (c.values ())
# prime numbers assigned to the first 25 chars
prmsi = [ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89 , 97]
# primes used in the key, all other set to 1
prms = []
Cord = [ord(c) - ord('a') for c in C]
for e,p in enumerate(prmsi):
if e in Cord:
prms.append (p)
else:
prms.append (1)
# Product of match
T = 1
for c,d in zip(C,D):
p = prms[ord (c) - ord('a')]
T *= p**d
print ("T=", T)
t = 1 # product of current string
f = 0
i = 0
matches = []
mi = 0
mn = Ns
mm = 0
while i < Ns:
k = prms[ord(s[i]) - ord ('a')]
t *= k
print ("testing:", s[f:i+1])
if (t > T):
# included too many chars: move start
t /= prms[ord(s[f]) - ord('a')] # remove first char, usually division by 1
f += 1 # increment start position
t /= k # will be retested, could be replaced with bool
elif t == T:
# found match
print ("FOUND match:", s[f:i+1])
matches.append (s[f:i+1])
if (i - f) < mn:
mm = mi
mn = i - f
mi += 1
t /= prms[ord(s[f]) - ord('a')] # remove first matching char
# look for next match
i += 1
f += 1
else:
# no match yet, keep searching
i += 1
return (mm, matches)
print (find (charcount, str))
编辑:显然有一个O(n)算法(参见algorithmist的答案)。很明显,这将打败下面描述的[天真]基线!
太糟糕了,我得走了……我有点怀疑我们能否达到O(n)。我明天会回来看看谁是赢家的;-) 玩得开心!
暂定算法:
一般的想法是顺序地尝试并使用在str1中找到的str2字符作为搜索的起点(在任一/两个方向上),以查找str2的所有其他字母。通过保持“迄今为止最佳匹配长度”值,我们可以在超过此值时中止搜索。其他启发式方法可能可以用于进一步中止次优(到目前为止)的解决方案。在str1中起始字母的选择很重要;建议从str1中计数最低的字母开始,并在随后的尝试中逐渐增加计数的其他字母。
[loose pseudo-code]
- get count for each letter/character in str1 (number of As, Bs etc.)
- get count for each letter in str2
- minLen = length(str1) + 1 (the +1 indicates you're not sure all chars of
str2 are in str1)
- Starting with the letter from string2 which is found the least in string1,
look for other letters of Str2, in either direction of str1, until you've
found them all (or not, at which case response = impossible => done!).
set x = length(corresponding substring of str1).
- if (x < minLen),
set minlen = x,
also memorize the start/len of the str1 substring.
- continue trying with other letters of str1 (going the up the frequency
list in str1), but abort search as soon as length(substring of strl)
reaches or exceed minLen.
We can find a few other heuristics that would allow aborting a
particular search, based on [pre-calculated ?] distance between a given
letter in str1 and some (all?) of the letters in str2.
- the overall search terminates when minLen = length(str2) or when
we've used all letters of str1 (which match one letter of str2)
as a starting point for the search
JavaScript暴力解决方案:
function shortestSubStringOfUniqueChars(s){
var uniqueArr = [];
for(let i=0; i<s.length; i++){
if(uniqueArr.indexOf(s.charAt(i)) <0){
uniqueArr.push(s.charAt(i));
}
}
let windoww = uniqueArr.length;
while(windoww < s.length){
for(let i=0; i<s.length - windoww; i++){
let match = true;
let tempArr = [];
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(s.charAt(i+j))<0){
match = false;
break;
}
}
let checkStr
if(match){
checkStr = s.substr(i, windoww);
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(checkStr.charAt(j))<0){
match = false;
break;
}
}
}
if(match){
return checkStr;
}
}
windoww = windoww + 1;
}
}
console.log(shortestSubStringOfUniqueChars("ABA"));
//[ShortestSubstring.java][1]
public class ShortestSubstring {
public static void main(String[] args) {
String input1 = "My name is Fran";
String input2 = "rim";
System.out.println(getShortestSubstring(input1, input2));
}
private static String getShortestSubstring(String mainString, String toBeSearched) {
int mainStringLength = mainString.length();
int toBeSearchedLength = toBeSearched.length();
if (toBeSearchedLength > mainStringLength) {
throw new IllegalArgumentException("search string cannot be larger than main string");
}
for (int j = 0; j < mainStringLength; j++) {
for (int i = 0; i <= mainStringLength - toBeSearchedLength; i++) {
String substring = mainString.substring(i, i + toBeSearchedLength);
if (checkIfMatchFound(substring, toBeSearched)) {
return substring;
}
}
toBeSearchedLength++;
}
return null;
}
private static boolean checkIfMatchFound(String substring, String toBeSearched) {
char[] charArraySubstring = substring.toCharArray();
char[] charArrayToBeSearched = toBeSearched.toCharArray();
int count = 0;
for (int i = 0; i < charArraySubstring.length; i++) {
for (int j = 0; j < charArrayToBeSearched.length; j++) {
if (String.valueOf(charArraySubstring[i]).equalsIgnoreCase(String.valueOf(charArrayToBeSearched[j]))) {
count++;
}
}
}
return count == charArrayToBeSearched.length;
}
}
string2
中是否也需要考虑重复项?否则,包含string1
中的tist
的最短子串是this
或stri
。 - Anurag