mirror of
https://git.FreeBSD.org/src.git
synced 2026-06-02 11:24:32 +00:00
inpcb: Split PCB hash tables
Currently we use a single hash table per PCB database for connected and bound PCBs. Since we started using net_epoch to synchronize hash table lookups, there's been a bug, noted in a comment above in_pcbrehash(): connecting a socket can cause an inpcb to move between hash chains, and this can cause a concurrent lookup to follow the wrong linkage pointers. I believe this could cause rare, spurious ECONNREFUSED errors in the worst case. Address the problem by introducing a second hash table and adding more linkage pointers to struct inpcb. Now the database has one table each for connected and unconnected sockets. When inserting an inpcb into the hash table, in_pcbinshash() now looks at the foreign address of the inpcb to figure out which table to use. This ensures that queue linkage pointers are stable until the socket is disconnected, so the problem described above goes away. There is also a small benefit in that in_pcblookup_*() can now search just one of the two possible hash buckets. I also made the "rehash" parameter of in(6)_pcbconnect() unused. This parameter seems confusing and it is simpler to let the inpcb code figure out what to do using the existing INP_INHASHLIST flag. UDP sockets pose a special problem since they can be connected and disconnected multiple times during their lifecycle. To handle this, the patch plugs a hole in the inpcb structure and uses it to store an SMR sequence number. When an inpcb is disconnected - an operation which requires the global PCB database hash lock - the write sequence number is advanced, and in order to reconnect, the connecting thread must wait for readers to drain before reusing the inpcb's hash chain linkage pointers. raw_ip (ab)uses the hash table without using the corresponding accessors. Since there are now two hash tables, it arbitrarily uses the "connected" table for all of its PCBs. This will be addressed in some way in the future. inp iterators which specify a hash bucket will only visit connected PCBs. 
This is not really correct, but nothing in the tree uses that functionality except raw_ip, which as mentioned above places all of its PCBs in the "connected" table and so is unaffected. Discussed with: glebius Tested by: glebius Sponsored by: Klara, Inc. Sponsored by: Modirum MDPay Differential Revision: https://reviews.freebsd.org/D38569
This commit is contained in:
+126
-53
@@ -58,8 +58,10 @@ __FBSDID("$FreeBSD$");
|
|||||||
#include <sys/mbuf.h>
|
#include <sys/mbuf.h>
|
||||||
#include <sys/eventhandler.h>
|
#include <sys/eventhandler.h>
|
||||||
#include <sys/domain.h>
|
#include <sys/domain.h>
|
||||||
|
#include <sys/proc.h>
|
||||||
#include <sys/protosw.h>
|
#include <sys/protosw.h>
|
||||||
#include <sys/smp.h>
|
#include <sys/smp.h>
|
||||||
|
#include <sys/smr.h>
|
||||||
#include <sys/socket.h>
|
#include <sys/socket.h>
|
||||||
#include <sys/socketvar.h>
|
#include <sys/socketvar.h>
|
||||||
#include <sys/sockio.h>
|
#include <sys/sockio.h>
|
||||||
@@ -510,7 +512,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
|
|||||||
#endif
|
#endif
|
||||||
CK_LIST_INIT(&pcbinfo->ipi_listhead);
|
CK_LIST_INIT(&pcbinfo->ipi_listhead);
|
||||||
pcbinfo->ipi_count = 0;
|
pcbinfo->ipi_count = 0;
|
||||||
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
|
pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
|
||||||
|
&pcbinfo->ipi_hashmask);
|
||||||
|
pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
|
||||||
&pcbinfo->ipi_hashmask);
|
&pcbinfo->ipi_hashmask);
|
||||||
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
|
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
|
||||||
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
|
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
|
||||||
@@ -532,7 +536,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
|
|||||||
KASSERT(pcbinfo->ipi_count == 0,
|
KASSERT(pcbinfo->ipi_count == 0,
|
||||||
("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
|
("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
|
||||||
|
|
||||||
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
|
hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
|
||||||
|
hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
|
||||||
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
|
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
|
||||||
pcbinfo->ipi_porthashmask);
|
pcbinfo->ipi_porthashmask);
|
||||||
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
|
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
|
||||||
@@ -630,6 +635,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
|
|||||||
#ifdef INET
|
#ifdef INET
|
||||||
inp->inp_vflag |= INP_IPV4;
|
inp->inp_vflag |= INP_IPV4;
|
||||||
#endif
|
#endif
|
||||||
|
inp->inp_smr = SMR_SEQ_INVALID;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Routes in inpcb's can cache L2 as well; they are guaranteed
|
* Routes in inpcb's can cache L2 as well; they are guaranteed
|
||||||
* to be cleaned up.
|
* to be cleaned up.
|
||||||
@@ -1010,7 +1017,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
|
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
|
||||||
bool rehash)
|
bool rehash __unused)
|
||||||
{
|
{
|
||||||
u_short lport, fport;
|
u_short lport, fport;
|
||||||
in_addr_t laddr, faddr;
|
in_addr_t laddr, faddr;
|
||||||
@@ -1018,6 +1025,8 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
|
|||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
||||||
|
KASSERT(in_nullhost(inp->inp_faddr),
|
||||||
|
("%s: inp is already connected", __func__));
|
||||||
|
|
||||||
lport = inp->inp_lport;
|
lport = inp->inp_lport;
|
||||||
laddr = inp->inp_laddr.s_addr;
|
laddr = inp->inp_laddr.s_addr;
|
||||||
@@ -1027,28 +1036,26 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
|
|||||||
if (error)
|
if (error)
|
||||||
return (error);
|
return (error);
|
||||||
|
|
||||||
|
inp->inp_faddr.s_addr = faddr;
|
||||||
|
inp->inp_fport = fport;
|
||||||
|
|
||||||
/* Do the initial binding of the local address if required. */
|
/* Do the initial binding of the local address if required. */
|
||||||
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
|
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
|
||||||
KASSERT(rehash == true,
|
|
||||||
("Rehashing required for unbound inps"));
|
|
||||||
inp->inp_lport = lport;
|
inp->inp_lport = lport;
|
||||||
inp->inp_laddr.s_addr = laddr;
|
inp->inp_laddr.s_addr = laddr;
|
||||||
if (in_pcbinshash(inp) != 0) {
|
if (in_pcbinshash(inp) != 0) {
|
||||||
inp->inp_laddr.s_addr = INADDR_ANY;
|
inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
|
||||||
inp->inp_lport = 0;
|
INADDR_ANY;
|
||||||
|
inp->inp_lport = inp->inp_fport = 0;
|
||||||
return (EAGAIN);
|
return (EAGAIN);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* Commit the remaining changes. */
|
|
||||||
inp->inp_lport = lport;
|
|
||||||
inp->inp_laddr.s_addr = laddr;
|
|
||||||
inp->inp_faddr.s_addr = faddr;
|
|
||||||
inp->inp_fport = fport;
|
|
||||||
if (rehash) {
|
|
||||||
in_pcbrehash(inp);
|
|
||||||
} else {
|
} else {
|
||||||
in_pcbinshash(inp);
|
inp->inp_lport = lport;
|
||||||
|
inp->inp_laddr.s_addr = laddr;
|
||||||
|
if ((inp->inp_flags & INP_INHASHLIST) != 0)
|
||||||
|
in_pcbrehash(inp);
|
||||||
|
else
|
||||||
|
in_pcbinshash(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (anonport)
|
if (anonport)
|
||||||
@@ -1402,11 +1409,16 @@ in_pcbdisconnect(struct inpcb *inp)
|
|||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
||||||
|
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
|
||||||
|
("%s: inp %p was already disconnected", __func__, inp));
|
||||||
|
|
||||||
|
in_pcbremhash_locked(inp);
|
||||||
|
|
||||||
|
/* See the comment in in_pcbinshash(). */
|
||||||
|
inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
|
||||||
inp->inp_laddr.s_addr = INADDR_ANY;
|
inp->inp_laddr.s_addr = INADDR_ANY;
|
||||||
inp->inp_faddr.s_addr = INADDR_ANY;
|
inp->inp_faddr.s_addr = INADDR_ANY;
|
||||||
inp->inp_fport = 0;
|
inp->inp_fport = 0;
|
||||||
in_pcbrehash(inp);
|
|
||||||
}
|
}
|
||||||
#endif /* INET */
|
#endif /* INET */
|
||||||
|
|
||||||
@@ -1551,11 +1563,11 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
|
|||||||
#define II_LIST_FIRST(ipi, hash) \
|
#define II_LIST_FIRST(ipi, hash) \
|
||||||
(((hash) == INP_ALL_LIST) ? \
|
(((hash) == INP_ALL_LIST) ? \
|
||||||
CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
|
CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
|
||||||
CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
|
CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
|
||||||
#define II_LIST_NEXT(inp, hash) \
|
#define II_LIST_NEXT(inp, hash) \
|
||||||
(((hash) == INP_ALL_LIST) ? \
|
(((hash) == INP_ALL_LIST) ? \
|
||||||
CK_LIST_NEXT((inp), inp_list) : \
|
CK_LIST_NEXT((inp), inp_list) : \
|
||||||
CK_LIST_NEXT((inp), inp_hash))
|
CK_LIST_NEXT((inp), inp_hash_exact))
|
||||||
#define II_LOCK_ASSERT(inp, lock) \
|
#define II_LOCK_ASSERT(inp, lock) \
|
||||||
rw_assert(&(inp)->inp_lock, \
|
rw_assert(&(inp)->inp_lock, \
|
||||||
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
|
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
|
||||||
@@ -1996,9 +2008,9 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
|
|||||||
* Look for an unconnected (wildcard foreign addr) PCB that
|
* Look for an unconnected (wildcard foreign addr) PCB that
|
||||||
* matches the local address and port we're looking for.
|
* matches the local address and port we're looking for.
|
||||||
*/
|
*/
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
|
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
/* XXX inp locking */
|
/* XXX inp locking */
|
||||||
if ((inp->inp_vflag & INP_IPV4) == 0)
|
if ((inp->inp_vflag & INP_IPV4) == 0)
|
||||||
@@ -2178,9 +2190,9 @@ in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
|||||||
INP_HASH_LOCK_ASSERT(pcbinfo);
|
INP_HASH_LOCK_ASSERT(pcbinfo);
|
||||||
|
|
||||||
match = NULL;
|
match = NULL;
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
|
head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_exact) {
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
/* XXX inp locking */
|
/* XXX inp locking */
|
||||||
if ((inp->inp_vflag & INP_IPV4) == 0)
|
if ((inp->inp_vflag & INP_IPV4) == 0)
|
||||||
@@ -2214,13 +2226,13 @@ in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
|||||||
* 3. non-jailed, non-wild.
|
* 3. non-jailed, non-wild.
|
||||||
* 4. non-jailed, wild.
|
* 4. non-jailed, wild.
|
||||||
*/
|
*/
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
|
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
local_wild = local_exact = jail_wild = NULL;
|
local_wild = local_exact = jail_wild = NULL;
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
local_wild_mapped = NULL;
|
local_wild_mapped = NULL;
|
||||||
#endif
|
#endif
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
|
||||||
bool injail;
|
bool injail;
|
||||||
|
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
@@ -2368,21 +2380,31 @@ in_pcbinshash(struct inpcb *inp)
|
|||||||
struct inpcbporthead *pcbporthash;
|
struct inpcbporthead *pcbporthash;
|
||||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
||||||
struct inpcbport *phd;
|
struct inpcbport *phd;
|
||||||
|
uint32_t hash;
|
||||||
|
bool connected;
|
||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
INP_HASH_WLOCK_ASSERT(pcbinfo);
|
INP_HASH_WLOCK_ASSERT(pcbinfo);
|
||||||
|
|
||||||
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
|
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
|
||||||
("in_pcbinshash: INP_INHASHLIST"));
|
("in_pcbinshash: INP_INHASHLIST"));
|
||||||
|
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
if (inp->inp_vflag & INP_IPV6)
|
if (inp->inp_vflag & INP_IPV6) {
|
||||||
pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
|
hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
|
||||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
|
inp->inp_fport, pcbinfo->ipi_hashmask);
|
||||||
else
|
connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
|
||||||
|
} else
|
||||||
#endif
|
#endif
|
||||||
pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
|
{
|
||||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
|
hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
|
||||||
|
inp->inp_fport, pcbinfo->ipi_hashmask);
|
||||||
|
connected = !in_nullhost(inp->inp_faddr);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (connected)
|
||||||
|
pcbhash = &pcbinfo->ipi_hash_exact[hash];
|
||||||
|
else
|
||||||
|
pcbhash = &pcbinfo->ipi_hash_wild[hash];
|
||||||
|
|
||||||
pcbporthash = &pcbinfo->ipi_porthashbase[
|
pcbporthash = &pcbinfo->ipi_porthashbase[
|
||||||
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
|
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
|
||||||
@@ -2421,66 +2443,117 @@ in_pcbinshash(struct inpcb *inp)
|
|||||||
}
|
}
|
||||||
inp->inp_phd = phd;
|
inp->inp_phd = phd;
|
||||||
CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
|
CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
|
||||||
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
|
|
||||||
|
/*
|
||||||
|
* The PCB may have been disconnected in the past. Before we can safely
|
||||||
|
* make it visible in the hash table, we must wait for all readers which
|
||||||
|
* may be traversing this PCB to finish.
|
||||||
|
*/
|
||||||
|
if (inp->inp_smr != SMR_SEQ_INVALID) {
|
||||||
|
smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
|
||||||
|
inp->inp_smr = SMR_SEQ_INVALID;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (connected)
|
||||||
|
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
|
||||||
|
else
|
||||||
|
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
|
||||||
inp->inp_flags |= INP_INHASHLIST;
|
inp->inp_flags |= INP_INHASHLIST;
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
void
|
||||||
in_pcbremhash(struct inpcb *inp)
|
in_pcbremhash_locked(struct inpcb *inp)
|
||||||
{
|
{
|
||||||
struct inpcbport *phd = inp->inp_phd;
|
struct inpcbport *phd = inp->inp_phd;
|
||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
|
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
||||||
MPASS(inp->inp_flags & INP_INHASHLIST);
|
MPASS(inp->inp_flags & INP_INHASHLIST);
|
||||||
|
|
||||||
INP_HASH_WLOCK(inp->inp_pcbinfo);
|
|
||||||
if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
|
if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
|
||||||
in_pcbremlbgrouphash(inp);
|
in_pcbremlbgrouphash(inp);
|
||||||
CK_LIST_REMOVE(inp, inp_hash);
|
#ifdef INET6
|
||||||
|
if (inp->inp_vflag & INP_IPV6) {
|
||||||
|
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_wild);
|
||||||
|
else
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_exact);
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
if (in_nullhost(inp->inp_faddr))
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_wild);
|
||||||
|
else
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_exact);
|
||||||
|
}
|
||||||
CK_LIST_REMOVE(inp, inp_portlist);
|
CK_LIST_REMOVE(inp, inp_portlist);
|
||||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
|
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
|
||||||
CK_LIST_REMOVE(phd, phd_hash);
|
CK_LIST_REMOVE(phd, phd_hash);
|
||||||
uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
|
uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
|
||||||
}
|
}
|
||||||
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
|
|
||||||
inp->inp_flags &= ~INP_INHASHLIST;
|
inp->inp_flags &= ~INP_INHASHLIST;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
in_pcbremhash(struct inpcb *inp)
|
||||||
|
{
|
||||||
|
INP_HASH_WLOCK(inp->inp_pcbinfo);
|
||||||
|
in_pcbremhash_locked(inp);
|
||||||
|
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Move PCB to the proper hash bucket when { faddr, fport } have been
|
* Move PCB to the proper hash bucket when { faddr, fport } have been
|
||||||
* changed. NOTE: This does not handle the case of the lport changing (the
|
* changed. NOTE: This does not handle the case of the lport changing (the
|
||||||
* hashed port list would have to be updated as well), so the lport must
|
* hashed port list would have to be updated as well), so the lport must
|
||||||
* not change after in_pcbinshash() has been called.
|
* not change after in_pcbinshash() has been called.
|
||||||
*
|
|
||||||
* XXXGL: a race between this function and SMR-protected hash iterator
|
|
||||||
* will lead to iterator traversing a possibly wrong hash list. However,
|
|
||||||
* this race should have been here since change from rwlock to epoch.
|
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
in_pcbrehash(struct inpcb *inp)
|
in_pcbrehash(struct inpcb *inp)
|
||||||
{
|
{
|
||||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
||||||
struct inpcbhead *head;
|
struct inpcbhead *head;
|
||||||
|
uint32_t hash;
|
||||||
|
bool connected;
|
||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
INP_HASH_WLOCK_ASSERT(pcbinfo);
|
INP_HASH_WLOCK_ASSERT(pcbinfo);
|
||||||
|
|
||||||
KASSERT(inp->inp_flags & INP_INHASHLIST,
|
KASSERT(inp->inp_flags & INP_INHASHLIST,
|
||||||
("in_pcbrehash: !INP_INHASHLIST"));
|
("%s: !INP_INHASHLIST", __func__));
|
||||||
|
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
|
||||||
|
("%s: inp was disconnected", __func__));
|
||||||
|
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
if (inp->inp_vflag & INP_IPV6)
|
if (inp->inp_vflag & INP_IPV6) {
|
||||||
head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
|
hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
|
||||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
|
inp->inp_fport, pcbinfo->ipi_hashmask);
|
||||||
else
|
connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
|
||||||
|
} else
|
||||||
#endif
|
#endif
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
|
{
|
||||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
|
hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
|
||||||
|
inp->inp_fport, pcbinfo->ipi_hashmask);
|
||||||
|
connected = !in_nullhost(inp->inp_faddr);
|
||||||
|
}
|
||||||
|
|
||||||
CK_LIST_REMOVE(inp, inp_hash);
|
/*
|
||||||
CK_LIST_INSERT_HEAD(head, inp, inp_hash);
|
* When rehashing, the caller must ensure that either the new or the old
|
||||||
|
* foreign address was unspecified.
|
||||||
|
*/
|
||||||
|
if (connected)
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_wild);
|
||||||
|
else
|
||||||
|
CK_LIST_REMOVE(inp, inp_hash_exact);
|
||||||
|
|
||||||
|
if (connected) {
|
||||||
|
head = &pcbinfo->ipi_hash_exact[hash];
|
||||||
|
CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
|
||||||
|
} else {
|
||||||
|
head = &pcbinfo->ipi_hash_wild[hash];
|
||||||
|
CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
+10
-6
@@ -45,13 +45,13 @@
|
|||||||
#include <sys/_lock.h>
|
#include <sys/_lock.h>
|
||||||
#include <sys/_mutex.h>
|
#include <sys/_mutex.h>
|
||||||
#include <sys/_rwlock.h>
|
#include <sys/_rwlock.h>
|
||||||
|
#include <sys/_smr.h>
|
||||||
#include <net/route.h>
|
#include <net/route.h>
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
#include <sys/lock.h>
|
#include <sys/lock.h>
|
||||||
#include <sys/proc.h>
|
#include <sys/proc.h>
|
||||||
#include <sys/rwlock.h>
|
#include <sys/rwlock.h>
|
||||||
#include <sys/smr.h>
|
|
||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
#include <net/vnet.h>
|
#include <net/vnet.h>
|
||||||
#include <vm/uma.h>
|
#include <vm/uma.h>
|
||||||
@@ -215,7 +215,8 @@ struct inpcbpolicy;
|
|||||||
struct m_snd_tag;
|
struct m_snd_tag;
|
||||||
struct inpcb {
|
struct inpcb {
|
||||||
/* Cache line #1 (amd64) */
|
/* Cache line #1 (amd64) */
|
||||||
CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */
|
CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */
|
||||||
|
CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */
|
||||||
struct rwlock inp_lock;
|
struct rwlock inp_lock;
|
||||||
/* Cache line #2 (amd64) */
|
/* Cache line #2 (amd64) */
|
||||||
#define inp_start_zero inp_hpts
|
#define inp_start_zero inp_hpts
|
||||||
@@ -261,11 +262,12 @@ struct inpcb {
|
|||||||
u_char inp_ip_p; /* (c) protocol proto */
|
u_char inp_ip_p; /* (c) protocol proto */
|
||||||
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
|
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
|
||||||
uint32_t inp_flowid; /* (x) flow id / queue id */
|
uint32_t inp_flowid; /* (x) flow id / queue id */
|
||||||
|
smr_seq_t inp_smr; /* (i) sequence number at disconnect */
|
||||||
struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
|
struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
|
||||||
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
|
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
|
||||||
|
|
||||||
/* Local and foreign ports, local and foreign addr. */
|
/* Local and foreign ports, local and foreign addr. */
|
||||||
struct in_conninfo inp_inc; /* (i) list for PCB's local port */
|
struct in_conninfo inp_inc; /* (i,h) list for PCB's local port */
|
||||||
|
|
||||||
/* MAC and IPSEC policy information. */
|
/* MAC and IPSEC policy information. */
|
||||||
struct label *inp_label; /* (i) MAC label */
|
struct label *inp_label; /* (i) MAC label */
|
||||||
@@ -430,10 +432,12 @@ struct inpcbinfo {
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Global hash of inpcbs, hashed by local and foreign addresses and
|
* Global hash of inpcbs, hashed by local and foreign addresses and
|
||||||
* port numbers.
|
* port numbers. The "exact" hash holds PCBs connected to a foreign
|
||||||
|
* address, and "wild" holds the rest.
|
||||||
*/
|
*/
|
||||||
struct mtx ipi_hash_lock;
|
struct mtx ipi_hash_lock;
|
||||||
struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */
|
struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */
|
||||||
|
struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */
|
||||||
u_long ipi_hashmask; /* (c) */
|
u_long ipi_hashmask; /* (c) */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -643,7 +647,6 @@ int inp_so_options(const struct inpcb *inp);
|
|||||||
#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */
|
#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */
|
||||||
#define IN6P_TCLASS 0x00400000 /* receive traffic class value */
|
#define IN6P_TCLASS 0x00400000 /* receive traffic class value */
|
||||||
#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */
|
#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */
|
||||||
/* was INP_TIMEWAIT 0x01000000 */
|
|
||||||
#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
|
#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
|
||||||
#define INP_DROPPED 0x04000000 /* protocol drop flag */
|
#define INP_DROPPED 0x04000000 /* protocol drop flag */
|
||||||
#define INP_SOCKREF 0x08000000 /* strong socket reference */
|
#define INP_SOCKREF 0x08000000 /* strong socket reference */
|
||||||
@@ -760,6 +763,7 @@ void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
|
|||||||
int, struct inpcb *(*)(struct inpcb *, int));
|
int, struct inpcb *(*)(struct inpcb *, int));
|
||||||
void in_pcbref(struct inpcb *);
|
void in_pcbref(struct inpcb *);
|
||||||
void in_pcbrehash(struct inpcb *);
|
void in_pcbrehash(struct inpcb *);
|
||||||
|
void in_pcbremhash_locked(struct inpcb *);
|
||||||
bool in_pcbrele_rlocked(struct inpcb *);
|
bool in_pcbrele_rlocked(struct inpcb *);
|
||||||
bool in_pcbrele_wlocked(struct inpcb *);
|
bool in_pcbrele_wlocked(struct inpcb *);
|
||||||
|
|
||||||
|
|||||||
@@ -166,8 +166,8 @@ rip_inshash(struct inpcb *inp)
|
|||||||
inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
|
inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
|
||||||
} else
|
} else
|
||||||
hash = 0;
|
hash = 0;
|
||||||
pcbhash = &pcbinfo->ipi_hashbase[hash];
|
pcbhash = &pcbinfo->ipi_hash_exact[hash];
|
||||||
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
|
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -177,7 +177,7 @@ rip_delhash(struct inpcb *inp)
|
|||||||
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
|
|
||||||
CK_LIST_REMOVE(inp, inp_hash);
|
CK_LIST_REMOVE(inp, inp_hash_exact);
|
||||||
}
|
}
|
||||||
#endif /* INET */
|
#endif /* INET */
|
||||||
|
|
||||||
|
|||||||
+20
-20
@@ -81,7 +81,9 @@ __FBSDID("$FreeBSD$");
|
|||||||
#include <sys/malloc.h>
|
#include <sys/malloc.h>
|
||||||
#include <sys/mbuf.h>
|
#include <sys/mbuf.h>
|
||||||
#include <sys/domain.h>
|
#include <sys/domain.h>
|
||||||
|
#include <sys/proc.h>
|
||||||
#include <sys/protosw.h>
|
#include <sys/protosw.h>
|
||||||
|
#include <sys/smr.h>
|
||||||
#include <sys/socket.h>
|
#include <sys/socket.h>
|
||||||
#include <sys/socketvar.h>
|
#include <sys/socketvar.h>
|
||||||
#include <sys/sockio.h>
|
#include <sys/sockio.h>
|
||||||
@@ -398,7 +400,7 @@ in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6,
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
|
in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
|
||||||
bool rehash)
|
bool rehash __unused)
|
||||||
{
|
{
|
||||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
|
||||||
struct sockaddr_in6 laddr6;
|
struct sockaddr_in6 laddr6;
|
||||||
@@ -411,6 +413,8 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
|
|||||||
("%s: invalid address family for %p", __func__, sin6));
|
("%s: invalid address family for %p", __func__, sin6));
|
||||||
KASSERT(sin6->sin6_len == sizeof(*sin6),
|
KASSERT(sin6->sin6_len == sizeof(*sin6),
|
||||||
("%s: invalid address length for %p", __func__, sin6));
|
("%s: invalid address length for %p", __func__, sin6));
|
||||||
|
KASSERT(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr),
|
||||||
|
("%s: inp is already connected", __func__));
|
||||||
|
|
||||||
bzero(&laddr6, sizeof(laddr6));
|
bzero(&laddr6, sizeof(laddr6));
|
||||||
laddr6.sin6_family = AF_INET6;
|
laddr6.sin6_family = AF_INET6;
|
||||||
@@ -440,17 +444,6 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
|
|||||||
return (EADDRINUSE);
|
return (EADDRINUSE);
|
||||||
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
|
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
|
||||||
if (inp->inp_lport == 0) {
|
if (inp->inp_lport == 0) {
|
||||||
/*
|
|
||||||
* rehash was required to be true in the past for
|
|
||||||
* this case; retain that convention. However,
|
|
||||||
* we now call in_pcb_lport_dest rather than
|
|
||||||
* in6_pcbbind; the former does not insert into
|
|
||||||
* the hash table, the latter does. Change rehash
|
|
||||||
* to false to do the in_pcbinshash below.
|
|
||||||
*/
|
|
||||||
KASSERT(rehash == true,
|
|
||||||
("Rehashing required for unbound inps"));
|
|
||||||
rehash = false;
|
|
||||||
error = in_pcb_lport_dest(inp,
|
error = in_pcb_lport_dest(inp,
|
||||||
(struct sockaddr *) &laddr6, &inp->inp_lport,
|
(struct sockaddr *) &laddr6, &inp->inp_lport,
|
||||||
(struct sockaddr *) sin6, sin6->sin6_port, cred,
|
(struct sockaddr *) sin6, sin6->sin6_port, cred,
|
||||||
@@ -468,7 +461,7 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
|
|||||||
inp->inp_flow |=
|
inp->inp_flow |=
|
||||||
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
|
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
|
||||||
|
|
||||||
if (rehash) {
|
if ((inp->inp_flags & INP_INHASHLIST) != 0) {
|
||||||
in_pcbrehash(inp);
|
in_pcbrehash(inp);
|
||||||
} else {
|
} else {
|
||||||
in_pcbinshash(inp);
|
in_pcbinshash(inp);
|
||||||
@@ -483,13 +476,20 @@ in6_pcbdisconnect(struct inpcb *inp)
|
|||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
|
||||||
|
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
|
||||||
|
("%s: inp %p was already disconnected", __func__, inp));
|
||||||
|
|
||||||
|
in_pcbremhash_locked(inp);
|
||||||
|
|
||||||
|
/* See the comment in in_pcbinshash(). */
|
||||||
|
inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
|
||||||
|
|
||||||
|
/* XXX-MJ torn writes are visible to SMR lookup */
|
||||||
memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr));
|
memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr));
|
||||||
memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr));
|
memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr));
|
||||||
inp->inp_fport = 0;
|
inp->inp_fport = 0;
|
||||||
/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
|
/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
|
||||||
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
|
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
|
||||||
in_pcbrehash(inp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct sockaddr *
|
struct sockaddr *
|
||||||
@@ -712,9 +712,9 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
|
|||||||
* Look for an unconnected (wildcard foreign addr) PCB that
|
* Look for an unconnected (wildcard foreign addr) PCB that
|
||||||
* matches the local address and port we're looking for.
|
* matches the local address and port we're looking for.
|
||||||
*/
|
*/
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
|
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
|
||||||
/* XXX inp locking */
|
/* XXX inp locking */
|
||||||
if ((inp->inp_vflag & INP_IPV6) == 0)
|
if ((inp->inp_vflag & INP_IPV6) == 0)
|
||||||
continue;
|
continue;
|
||||||
@@ -952,9 +952,9 @@ in6_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
|
|||||||
* First look for an exact match.
|
* First look for an exact match.
|
||||||
*/
|
*/
|
||||||
match = NULL;
|
match = NULL;
|
||||||
head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(faddr, lport, fport,
|
head = &pcbinfo->ipi_hash_exact[INP6_PCBHASH(faddr, lport, fport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_exact) {
|
||||||
/* XXX inp locking */
|
/* XXX inp locking */
|
||||||
if ((inp->inp_vflag & INP_IPV6) == 0)
|
if ((inp->inp_vflag & INP_IPV6) == 0)
|
||||||
continue;
|
continue;
|
||||||
@@ -982,10 +982,10 @@ in6_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo,
|
|||||||
* 3. non-jailed, non-wild.
|
* 3. non-jailed, non-wild.
|
||||||
* 4. non-jailed, wild.
|
* 4. non-jailed, wild.
|
||||||
*/
|
*/
|
||||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
|
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
|
||||||
pcbinfo->ipi_hashmask)];
|
pcbinfo->ipi_hashmask)];
|
||||||
local_wild = local_exact = jail_wild = NULL;
|
local_wild = local_exact = jail_wild = NULL;
|
||||||
CK_LIST_FOREACH(inp, head, inp_hash) {
|
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
|
||||||
bool injail;
|
bool injail;
|
||||||
|
|
||||||
/* XXX inp locking */
|
/* XXX inp locking */
|
||||||
|
|||||||
Reference in New Issue
Block a user