套接字无法识别协议(套接字泄漏)

22

我有一个Go1.5.1的进程/应用程序。当我在该进程上运行/usr/sbin/lsof -p时,我看到很多“无法识别的协议”。

monitor_ 13105 root  101u  sock      0,6      0t0 16960100 can't identify protocol
monitor_ 13105 root  102u  sock      0,6      0t0 21552427 can't identify protocol
monitor_ 13105 root  103u  sock      0,6      0t0 17565091 can't identify protocol
monitor_ 13105 root  104u  sock      0,6      0t0 18476870 can't identify protocol

进程状态/限制/文件描述符

[root@Monitor_q ~]# cat /proc/13105/status 
Name:   monitor_client
State:  S (sleeping)
Tgid:   13105
Pid:    13105
PPid:   13104
TracerPid:  0
Uid:    0   0   0   0
Gid:    0   0   0   0
Utrace: 0
FDSize: 16384
Groups: 
...


[root@Monitor_q ~]# cat /proc/13105/limits 
Limit                     Soft Limit           Hard Limit           Units     
Max cpu time              unlimited            unlimited            seconds   
Max file size             unlimited            unlimited            bytes     
Max data size             unlimited            unlimited            bytes     
Max stack size            10485760             unlimited            bytes     
Max core file size        0                    unlimited            bytes     
Max resident set          unlimited            unlimited            bytes     
Max processes             3870                 3870                 processes 
Max open files            9999                 9999                 files     
Max locked memory         65536                65536                bytes     
Max address space         unlimited            unlimited            bytes     
Max file locks            unlimited            unlimited            locks     
Max pending signals       3870                 3870                 signals   
Max msgqueue size         819200               819200               bytes     
Max nice priority         0                    0                    
Max realtime priority     0                    0                    
Max realtime timeout      unlimited            unlimited            us

[root@Monitor_q ~]# ll /proc/13105/fd/
lrwx------ 1 root root 64 Dec  7 00:15 8382 -> socket:[52023221]
lrwx------ 1 root root 64 Dec  7 00:15 8383 -> socket:[51186627]
lrwx------ 1 root root 64 Dec  7 00:15 8384 -> socket:[51864232]
lrwx------ 1 root root 64 Dec  7 00:15 8385 -> socket:[52435453]
lrwx------ 1 root root 64 Dec  7 00:15 8386 -> socket:[51596071]
lrwx------ 1 root root 64 Dec  7 00:15 8387 -> socket:[52767667]
lrwx------ 1 root root 64 Dec  7 00:15 8388 -> socket:[52090632]
lrwx------ 1 root root 64 Dec  7 00:15 8389 -> socket:[51739068]
lrwx------ 1 root root 64 Dec  7 00:15 839 -> socket:[22963529]
lrwx------ 1 root root 64 Dec  7 00:15 8390 -> socket:[52023223]
lrwx------ 1 root root 64 Dec  7 00:15 8391 -> socket:[52560389]
lrwx------ 1 root root 64 Dec  7 00:15 8392 -> socket:[52402565]
...

但是在netstat -a中没有类似的输出。

这些套接字是什么,我怎样才能找出它们的作用?

monitor_client.go

package main

import (
    "crypto/tls"
    "encoding/json"
    "fmt"
    "log"
    "net"
    "net/http"
    nurl "net/url"
    "strconv"
    "strings"
    "syscall"
    "time"
)

type Result struct {
    Error      string        `json:"error"`
    HttpStatus int           `json:"http_status"`
    Stime      time.Duration `json:"http_time"`
}

//http://stackoverflow.com/questions/20990332/golang-http-timeout-and-goroutines-accumulation
//http://3.3.3.3/http?host=3.2.4.2&servername=a.test&path=/&port=33&timeout=5&scheme=http
func MonitorHttp(w http.ResponseWriter, r *http.Request) {
    var host, servername, path, port, scheme string
    var timeout int
    u, err := nurl.Parse(r.RequestURI)
    if err != nil {
        log.Fatal(err)
        return
    }
    if host = u.Query().Get("host"); host == "" {
        host = "127.0.0.0"
    }
    if servername = u.Query().Get("servername"); servername == "" {
        servername = "localhost"
    }
    if path = u.Query().Get("path"); path == "" {
        path = "/"
    }
    if port = u.Query().Get("port"); port == "" {
        port = "80"
    }
    if scheme = u.Query().Get("scheme"); scheme == "" {
        scheme = "http"
    }

    if timeout, _ = strconv.Atoi(u.Query().Get("timeout")); timeout == 0 {
        timeout = 5
    }

    //log.Printf("(host)=%s (servername)=%s (path)=%s (port)=%s (timeout)=%d", host, servername, path, port, timeout)

    w.Header().Set("Content-Type", "application/json")

    res := httptool(host, port, servername, scheme, path, timeout)
    result, _ := json.Marshal(res)
    fmt.Fprintf(w, "%s", result)
}

func httptool(ip, port, servername, scheme, path string, timeout int) Result {

    var result Result
    startTime := time.Now()
    host := ip + ":" + port

    transport := &http.Transport{
        TLSClientConfig:   &tls.Config{InsecureSkipVerify: true},
        DisableKeepAlives: true,
    }

    dialer := net.Dialer{
        Timeout:   time.Duration(timeout) * time.Second,
        KeepAlive: 0 * time.Second,
    }
    transport.Dial = func(network, address string) (net.Conn, error) {
        return dialer.Dial(network, address)
    }

    client := &http.Client{
        Transport: transport,
    }
    rawquery := ""
    url := fmt.Sprintf("%s://%s%s%s", scheme, host, path, rawquery)
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        result.HttpStatus = -1
        errs := strings.Split(err.Error(), ": ")
        result.Error = errs[len(errs)-1]
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    req.Header.Set("User-Agent", "monitor worker")
    req.Header.Set("Connection", "close")
    req.Host = servername
    resp, err := client.Do(req)
    //https://github.com/Basiclytics/neverdown/blob/master/check.go
    if err != nil {
        nerr, ok := err.(*nurl.Error)
        if ok {
            switch cerr := nerr.Err.(type) {
            case *net.OpError:
                switch cerr.Err.(type) {
                case *net.DNSError:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "dns: " + errs[len(errs)-1]
                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "server: " + errs[len(errs)-1]
                }
            default:
                switch nerr.Err.Error() {
                case "net/http: request canceled while waiting for connection":
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "timeout: " + errs[len(errs)-1]

                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "unknown: " + errs[len(errs)-1]
                }
            }

        } else {
            result.Error = "unknown: " + err.Error()
        }
        result.HttpStatus = -2
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    resp.Body.Close()
    result.HttpStatus = resp.StatusCode
    result.Error = "noerror"
    result.Stime = time.Now().Sub(startTime) / time.Millisecond //spend time (ms)
    return result
}

func setRlimit() {
    var rLimit syscall.Rlimit
    err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit)
    if err != nil {
        log.Printf("Unable to obtain rLimit", err)
    }
    if rLimit.Cur < rLimit.Max {
        rLimit.Max = 9999
        rLimit.Cur = 9999
        err = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit)
        if err != nil {
            log.Printf("Unable to increase number of open files limit", err)
        }
    }
}

func main() {
    setRlimit()
    s := &http.Server{
        Addr:         ":59059",
        ReadTimeout:  7 * time.Second,
        WriteTimeout: 7 * time.Second,
    }
    http.HandleFunc("/http", MonitorHttp)

    log.Fatal(s.ListenAndServe())
}

4
为什么每次通话都要构建整个客户端、拨号器、传输等堆栈?为什么不使用单个客户端处理所有事情?客户端可以进行连接池和回收等操作。 - Not_a_Golfer
8
你的请求没有超时时间,所以任何卡住的请求都会保持连接处于打开状态。你还禁用了TCP keepalives,所以无法检测到断开的连接。 - JimB
2
代码中可能存在的其他问题:您的错误处理不正确,而且为了稍微更改错误字符串而非常冗长。您无缘无故地第二次发送了Connection: close。服务器端有读/写超时,但没有办法提前退出处理程序(这可能是另一个导致套接字无法关闭的地方)。除非您有理由覆盖它,否则不应为每个调用创建新的传输(或客户端),并使用DefaultTransport。您在注释中引用的代码示例实际上并不正确,请参阅官方文档。 - JimB
2
确实,正如@JimB所怀疑的那样,“lsof”会为半开放套接字打印“无法识别协议”。 - BadZen
4
可能是Seeing too many lsof can't identify protocol的重复问题。 - Kerem
显示剩余2条评论
2个回答

2
这里有几个要点。
首先,我无法重现你的行为。不管怎样,“无法识别协议”通常与套接字未正确关闭有关。
一些评论者建议你不必在每个处理程序内创建http客户端——这是正确的。只需创建一次并重复使用即可。
其次,我不确定为什么要创建自己的“http.Client”结构体以及为什么要禁用keepalives。你不能只使用“http.Get”吗?更简单的代码更容易调试。
第三,不确定为什么要覆盖“transport.Dial”函数。即使你必须这样做,文档(针对Go 1.9.2)也说:
% go doc http.transport.dial
type Transport struct {
    // Dial specifies the dial function for creating unencrypted TCP
    connections.
    //
    // Deprecated: Use DialContext instead, which allows the transport
    // to cancel dials as soon as they are no longer needed.
    // If both are set, DialContext takes priority.
    Dial func(network, addr string) (net.Conn, error)

那句有关弃用和缺少旋钮重用的评论可能指向你问题的源头。

总之,如果我是你,我会做两件事:

  • 将客户端创建移至只执行一次的代码中,或者仅使用默认客户端 http.Get
  • 清理覆盖默认传输字段的内容,如果必须这样做,则应按照建议使用 DialContext

祝好运。


0

我无法重现这个问题。但是这是我的两分钱(没有双关语)

  1. 在一个文章https://idea.popcount.org/2012-12-09-lsof-cant-identify-protocol/中发现了类似的问题,据观察该问题出现在FreeBSD上的SockJS-node中。但问题是“websockets没有被正确清理”。
  2. 如果您仍然拥有相同的环境,请进行另一个测试。如果可能,请发布wireshark日志。只是为了确认网络帧中没有微妙的东西导致了这个问题。

很抱歉我不能安装Go 1.5.1来重现这个问题。 希望这对你有帮助。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接