bind not able to query some servers / domains

We have been experiencing problems with our company's DNS server when trying to resolve certain domains. We are running BIND 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 on a CentOS 6.5 server. We are authoritative for some zones, and our internal clients and mail system resolve using this server. One of the domains we are having trouble resolving is www.dhl.com; here's what we get when querying using dig:

[root@serverx etc] dig www.dhl.com

; <<>> DiG 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 <<>> www.dhl.com
;; global options: +cmd
;; connection timed out; no servers could be reached

and

[root@serverx etc]# dig +trace www.dhl.com

; <<>> DiG 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 <<>> +trace www.dhl.com
;; global options: +cmd
.           517419  IN  NS  g.root-servers.net.
.           517419  IN  NS  a.root-servers.net.
.           517419  IN  NS  h.root-servers.net.
.           517419  IN  NS  m.root-servers.net.
.           517419  IN  NS  f.root-servers.net.
.           517419  IN  NS  b.root-servers.net.
.           517419  IN  NS  l.root-servers.net.
.           517419  IN  NS  j.root-servers.net.
.           517419  IN  NS  k.root-servers.net.
.           517419  IN  NS  e.root-servers.net.
.           517419  IN  NS  i.root-servers.net.
.           517419  IN  NS  d.root-servers.net.
.           517419  IN  NS  c.root-servers.net.
;; Received 496 bytes from 192.168.X.X#53(192.168.X.X) in 11 ms

com.            172800  IN  NS  a.gtld-servers.net.
com.            172800  IN  NS  d.gtld-servers.net.
com.            172800  IN  NS  k.gtld-servers.net.
com.            172800  IN  NS  b.gtld-servers.net.
com.            172800  IN  NS  j.gtld-servers.net.
com.            172800  IN  NS  c.gtld-servers.net.
com.            172800  IN  NS  h.gtld-servers.net.
com.            172800  IN  NS  l.gtld-servers.net.
com.            172800  IN  NS  e.gtld-servers.net.
com.            172800  IN  NS  g.gtld-servers.net.
com.            172800  IN  NS  m.gtld-servers.net.
com.            172800  IN  NS  f.gtld-servers.net.
com.            172800  IN  NS  i.gtld-servers.net.
;; Received 489 bytes from 202.12.27.33#53(202.12.27.33) in 6128 ms

dhl.com.        172800  IN  NS  ns4.dhl.com.
dhl.com.        172800  IN  NS  ns6.dhl.com.
dig: couldn't get address for 'ns4.dhl.com': no more

And when I run dig against Google's DNS server:

dig @8.8.4.4 www.dhl.com

; <<>> DiG 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 <<>> @8.8.4.4 www.dhl.com
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 11325
;; flags: qr rd ra; QUERY: 1, ANSWER: 4, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;www.dhl.com.           IN  A

;; ANSWER SECTION:
www.dhl.com.        1619    IN  CNAME   ngw.dhl.com.edgesuite.net.
ngw.dhl.com.edgesuite.net. 8520 IN  CNAME   a1085.g.akamai.net.
a1085.g.akamai.net. 19  IN  A   23.74.2.113
a1085.g.akamai.net. 19  IN  A   23.74.2.120

;; Query time: 229 msec
;; SERVER: 8.8.4.4#53(8.8.4.4)
;; WHEN: Thu Dec  4 14:47:56 2014
;; MSG SIZE  rcvd: 129

No problem there, and that query was run from the same server...

When you look at "/var/log/messages", nothing gets logged! Again, this only happens with certain domains, and this server was working fine a couple of days ago. We have also disabled SELinux for testing purposes.

This is our named.conf file (named is running in a chroot environment):

options {
//      listen-on port 53 { 127.0.0.1;192.168.xx.x; };
//      listen-on-v6 port 53 { ::1; };
        directory       "/var/named";
        dump-file       "/var/named/data/cache_dump.db";
        statistics-file "/var/named/data/named_stats.txt";
        memstatistics-file "/var/named/data/named_mem_stats.txt";
//      allow-query     { localhost; };
        allow-query     { any; };
        recursion yes;
        allow-recursion { recursive-clients; };
//      query-source address * port 53;
//      dnssec-enable yes;
        dnssec-enable no;
//      dnssec-validation yes;
        dnssec-validation no;
        dnssec-lookaside auto;

        /* Path to ISC DLV key */
        bindkeys-file "/etc/named.iscdlv.key";

        managed-keys-directory "/var/named/dynamic";
        allow-transfer { xxx.x.x.x; xxx.x.x.x; xxx.x.x.x; 127.0.0.1; };
        allow-update { 192.168.xx.xx; };
//        forwarders { 8.8.4.4; };
};

acl recursive-clients { xxx.x.x.x/24; 127.0.0.1; xxx.xxx.xx.x/24; xx.xxx.xxx.xxx/29; xxx.xxx.xx.x;};

logging {
        channel default_debug {
                file "data/named.run";
                severity dynamic;
        };

};

zone "." IN {
        type hint;
        file "named.ca";
};


zone "domain.com.xx" IN {
        type master;
        file "db.domain.com.xx";
        allow-transfer { xxx.xxx.xx.xx; xxx.xxx.xxx.x; };
};

zone "xx.xxx.xxx.in-addr.arpa" IN {
        type master;
        file "db.xxx.xxx.xx";
};

include "/etc/named.rfc1912.zones";
include "/etc/named.root.key";

Any ideas, guys? I've been running tests and researching since yesterday and I cannot figure out what is happening!

Thanks in advance for any help or idea.

Here is the result using dig +trace +additional www.dhl.com:

dig +trace +additional www.dhl.com

; <<>> DiG 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 <<>> +trace +additional www.dhl.com
;; global options: +cmd
.           518340  IN  NS  h.root-servers.net.
.           518340  IN  NS  l.root-servers.net.
.           518340  IN  NS  e.root-servers.net.
.           518340  IN  NS  k.root-servers.net.
.           518340  IN  NS  i.root-servers.net.
.           518340  IN  NS  m.root-servers.net.
.           518340  IN  NS  b.root-servers.net.
.           518340  IN  NS  c.root-servers.net.
.           518340  IN  NS  g.root-servers.net.
.           518340  IN  NS  f.root-servers.net.
.           518340  IN  NS  d.root-servers.net.
.           518340  IN  NS  a.root-servers.net.
.           518340  IN  NS  j.root-servers.net.
k.root-servers.net. 518345  IN  A   193.0.14.129
k.root-servers.net. 518345  IN  AAAA    2001:7fd::1
b.root-servers.net. 518345  IN  A   192.228.79.201
b.root-servers.net. 518345  IN  AAAA    2001:500:84::b
c.root-servers.net. 518345  IN  A   192.33.4.12
c.root-servers.net. 518345  IN  AAAA    2001:500:2::c
i.root-servers.net. 518345  IN  A   192.36.148.17
i.root-servers.net. 518345  IN  AAAA    2001:7fe::53
f.root-servers.net. 518345  IN  A   192.5.5.241
f.root-servers.net. 518345  IN  AAAA    2001:500:2f::f
h.root-servers.net. 518345  IN  A   128.63.2.53
h.root-servers.net. 518345  IN  AAAA    2001:500:1::803f:235
a.root-servers.net. 518345  IN  A   198.41.0.4
;; Received 508 bytes from 192.168.x.x#53(192.168.x.x) in 11363 ms

com.            172800  IN  NS  d.gtld-servers.net.
com.            172800  IN  NS  l.gtld-servers.net.
com.            172800  IN  NS  b.gtld-servers.net.
com.            172800  IN  NS  m.gtld-servers.net.
com.            172800  IN  NS  c.gtld-servers.net.
com.            172800  IN  NS  k.gtld-servers.net.
com.            172800  IN  NS  i.gtld-servers.net.
com.            172800  IN  NS  a.gtld-servers.net.
com.            172800  IN  NS  f.gtld-servers.net.
com.            172800  IN  NS  g.gtld-servers.net.
com.            172800  IN  NS  j.gtld-servers.net.
com.            172800  IN  NS  h.gtld-servers.net.
com.            172800  IN  NS  e.gtld-servers.net.
a.gtld-servers.net. 172800  IN  A   192.5.6.30
b.gtld-servers.net. 172800  IN  A   192.33.14.30
c.gtld-servers.net. 172800  IN  A   192.26.92.30
d.gtld-servers.net. 172800  IN  A   192.31.80.30
e.gtld-servers.net. 172800  IN  A   192.12.94.30
f.gtld-servers.net. 172800  IN  A   192.35.51.30
g.gtld-servers.net. 172800  IN  A   192.42.93.30
h.gtld-servers.net. 172800  IN  A   192.54.112.30
i.gtld-servers.net. 172800  IN  A   192.43.172.30
j.gtld-servers.net. 172800  IN  A   192.48.79.30
k.gtld-servers.net. 172800  IN  A   192.52.178.30
l.gtld-servers.net. 172800  IN  A   192.41.162.30
m.gtld-servers.net. 172800  IN  A   192.55.83.30
a.gtld-servers.net. 172800  IN  AAAA    2001:503:a83e::2:30
;; Received 489 bytes from 202.12.27.33#53(202.12.27.33) in 4335 ms

dhl.com.        172800  IN  NS  ns4.dhl.com.
dhl.com.        172800  IN  NS  ns6.dhl.com.
ns4.dhl.com.        172800  IN  A   165.72.192.16
ns6.dhl.com.        172800  IN  A   199.40.254.166
dig: couldn't get address for 'ns4.dhl.com': no more

Output from dig +tcp www.redhat.com

dig +tcp www.redhat.com

; <<>> DiG 9.8.2rc1-RedHat-9.8.2-0.30.rc1.el6 <<>> +tcp www.redhat.com
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 62110
;; flags: qr rd ra; QUERY: 1, ANSWER: 4, AUTHORITY: 8, ADDITIONAL: 8

;; QUESTION SECTION:
;www.redhat.com.            IN  A

;; ANSWER SECTION:
www.redhat.com.     11  IN  CNAME   wildcard.redhat.com.edgekey.net.
wildcard.redhat.com.edgekey.net. 20484 IN CNAME wildcard.redhat.com.edgekey.net.globalredir.akadns.net.
wildcard.redhat.com.edgekey.net.globalredir.akadns.net. 2485 IN CNAME e1890.b.akamaiedge.net.
e1890.b.akamaiedge.net. 20  IN  A   172.229.164.152

;; AUTHORITY SECTION:
b.akamaiedge.net.   2874    IN  NS  n2b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n3b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n4b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n1b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n7b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n5b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n6b.akamaiedge.net.
b.akamaiedge.net.   2874    IN  NS  n0b.akamaiedge.net.

;; ADDITIONAL SECTION:
n5b.akamaiedge.net. 6917    IN  A   201.144.215.107
n1b.akamaiedge.net. 4917    IN  A   23.61.206.74
n6b.akamaiedge.net. 2917    IN  A   201.144.215.108
n2b.akamaiedge.net. 6917    IN  A   192.204.11.244
n4b.akamaiedge.net. 4917    IN  A   201.144.215.110
n7b.akamaiedge.net. 4917    IN  A   201.144.215.113
n0b.akamaiedge.net. 2917    IN  A   23.61.206.68
n3b.akamaiedge.net. 2917    IN  A   201.144.215.114

;; Query time: 132 msec
;; SERVER: 192.168.x.x#53(192.168.x.x)
;; WHEN: Thu Dec  4 16:03:03 2014
;; MSG SIZE  rcvd: 463

Other tests:

traceroute -U -p 53 165.72.192.16
traceroute to 165.72.192.16 (165.72.192.16), 30 hops max, 60 byte packets
 1  192.168.17.10 (192.168.17.10)  0.724 ms  0.718 ms  0.681 ms
 2  168.243.205.74 (168.243.205.74)  4.188 ms  4.677 ms  3.945 ms
 3  172.26.64.21 (172.26.64.21)  179.831 ms  180.282 ms  181.633 ms
 4  172.24.0.13 (172.24.0.13)  182.433 ms  182.585 ms  179.870 ms
 5  172.24.0.9 (172.24.0.9)  180.654 ms  183.023 ms  180.876 ms
 6  * * *
 7  * * *
 8  * * *
 9  * * *
10  * * *

And:

ping 165.72.192.16
PING 165.72.192.16 (165.72.192.16) 56(84) bytes of data.
64 bytes from 165.72.192.16: icmp_seq=1 ttl=240 time=356 ms
64 bytes from 165.72.192.16: icmp_seq=2 ttl=240 time=325 ms
64 bytes from 165.72.192.16: icmp_seq=3 ttl=240 time=291 ms
64 bytes from 165.72.192.16: icmp_seq=4 ttl=240 time=260 ms

traceroute -U -p 53 199.40.254.166
traceroute to 199.40.254.166 (199.40.254.166), 30 hops max, 60 byte packets
 1  192.168.17.10 (192.168.17.10)  0.710 ms  0.683 ms  0.764 ms
 2  168.243.205.74 (168.243.205.74)  3.136 ms  3.875 ms  4.191 ms
 3  172.26.64.21 (172.26.64.21)  19.367 ms  18.988 ms  19.698 ms
 4  172.24.0.177 (172.24.0.177)  4.657 ms  6.608 ms  7.088 ms
 5  172.24.0.9 (172.24.0.9)  5.126 ms  7.412 ms  5.518 ms
 6  * * *
 7  * * *
 8  * * *
 9  * * *

ping 199.40.254.166
PING 199.40.254.166 (199.40.254.166) 56(84) bytes of data.
64 bytes from 199.40.254.166: icmp_seq=1 ttl=237 time=287 ms
64 bytes from 199.40.254.166: icmp_seq=2 ttl=237 time=280 ms
64 bytes from 199.40.254.166: icmp_seq=3 ttl=237 time=286 ms

UPDATE - 5 Dec 2014

Well, I have installed BIND on another server on the same subnet, with the same OS version and the same BIND release, and it works just fine! The only thing that changes is the IP address, and I can't change that to test because the server with the problem is in production. So I think the IDS theory that Andrew suggests is true. I'll talk to our ISP, investigate whether our external IP is blacklisted, and post how it goes...

UPDATE - 6 Dec 2014

The server's IP address is not listed in any black list that I checked on the Internet...


Solution 1:

Rerun your dig +trace with the +additional flag. I expect it to still fail, but I'll explain what's happening.

  • +trace with +additional will display the nameserver glue (the ADDITIONAL section), which will at least tell you that the TLD nameservers are properly returning the IP address of ns4.dhl.com.
  • The "couldn't get address for 'ns4.dhl.com': no more" failure is coming from your local nameserver. This is an obscure detail, but dig +trace does not use data from the ADDITIONAL section to determine the IP address of the "next hop" nameserver. It is actually querying the nameserver listed in /etc/resolv.conf to obtain the IP address of ns4.dhl.com, and this query is failing.

Based on the result of the dig commands I had you run in the comments, there appears to be a problem communicating with DHL's nameservers on port 53. A UDP-based traceroute on port 53 (traceroute -U -p 53) will give you a better idea of where the packet is being lost. Either a device on your network or one on DHL's network is losing the query/reply. In the latter case, your nameserver's IP address may have somehow gotten itself into a reputation database for an IDS device.

Solution 2:

Have you checked that your firewall is not blocking port 53 TCP? It looks to me like the dhl.com zone is signed, so its responses are quite large. For large responses, DNS falls back from UDP to TCP, and that might explain your problem.