/*
   ctdb system specific code to manage raw sockets on linux

   Copyright (C) Ronnie Sahlberg  2007
   Copyright (C) Andrew Tridgell  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "replace.h"

/*
 * Use BSD struct tcphdr field names for portability.  Modern glibc
 * makes them available by default via <netinet/tcp.h> but older glibc
 * requires __FAVOR_BSD to be defined.
 *
 * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
 * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
 * set.  Including "replace.h" above causes <features.h> to be
 * indirectly included and this will not set __FAVOR_BSD because
 * _GNU_SOURCE is set in Samba's "config.h" (which is included by
 * "replace.h").
 *
 * Therefore, set __FAVOR_BSD by hand below.
 */
#define __FAVOR_BSD 1
#include "system/network.h"

#ifdef HAVE_NETINET_IF_ETHER_H
#include <netinet/if_ether.h>
#endif
#ifdef HAVE_NETINET_IP6_H
#include <netinet/ip6.h>
#endif
#ifdef HAVE_NETINET_ICMP6_H
#include <netinet/icmp6.h>
#endif
#ifdef HAVE_LINUX_IF_PACKET_H
#include <linux/if_packet.h>
#endif

#ifndef ETHERTYPE_IP6
#define ETHERTYPE_IP6 0x86dd
#endif

#include "lib/util/debug.h"
#include "lib/util/blocking.h"

#include "protocol/protocol.h"

#include "common/logging.h"
#include "common/system_socket.h"

/*
 * Sum n bytes starting at data as a sequence of 16-bit words in
 * network (big-endian) byte order and return the un-folded 32-bit
 * total.
 *
 * If n is odd, the final byte is treated as the high-order byte of a
 * word whose low-order byte is zero, which is the padding used by the
 * Internet checksum.
 *
 * The input is read one byte at a time, so the result is the same on
 * little- and big-endian hosts and data does not have to be 2-byte
 * aligned.
 */
static uint32_t uint16_checksum(uint16_t *data, size_t n)
{
	const uint8_t *p = (const uint8_t *)data;
	uint32_t sum = 0;

	while (n >= 2) {
		sum += ((uint32_t)p[0] << 8) | (uint32_t)p[1];
		p += 2;
		n -= 2;
	}
	if (n == 1) {
		/* Odd trailing byte: pad with a zero low-order byte */
		sum += (uint32_t)p[0] << 8;
	}
	return sum;
}

/*
 * Check whether the given IP address is currently configured on a
 * local interface.
 *
 * This works by trying to bind() a throw-away TCP socket to a copy of
 * the address with the port forced to 0.  Returns true if the bind
 * succeeds, false if the socket can not be created or the bind fails.
 * The caller's address is not modified.
 */
bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
{
	ctdb_sock_addr probe = *_addr;
	socklen_t probe_len;
	int fd;
	int rc;

	if (probe.sa.sa_family == AF_INET) {
		probe.ip.sin_port = 0;
		probe_len = sizeof(struct sockaddr_in);
	} else if (probe.sa.sa_family == AF_INET6) {
		probe.ip6.sin6_port = 0;
		probe_len = sizeof(struct sockaddr_in6);
	} else {
		probe_len = 0;
	}

	fd = socket(probe.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
	if (fd == -1) {
		return false;
	}

	rc = bind(fd, (struct sockaddr *)&probe, probe_len);
	close(fd);

	return (rc == 0);
}

/*
 * Fill in sin as an AF_INET socket address from the dotted-quad
 * string s and the host byte order port.
 *
 * Returns true on success.  If s can not be converted an error is
 * logged and false is returned; sin_family and sin_port have already
 * been written in that case.
 */
static bool parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin)
{
	int rc;

	sin->sin_family = AF_INET;
	sin->sin_port   = htons(port);

	rc = inet_pton(AF_INET, s, &sin->sin_addr);
	if (rc == 1) {
#ifdef HAVE_SOCK_SIN_LEN
		sin->sin_len = sizeof(*sin);
#endif
		return true;
	}

	DBG_ERR("Failed to translate %s into sin_addr\n", s);
	return false;
}

/*
 * Fill in saddr->ip6 as an AF_INET6 socket address from the textual
 * IPv6 address s and the host byte order port.
 *
 * If ifaces is not NULL and the address is link-local then ifaces
 * must name exactly one interface (no ','); its index is used as the
 * scope id.  For all other addresses the scope id is 0.
 *
 * Returns true on success, false (after logging) if s can not be
 * converted or a link-local address is given with several interfaces.
 */
static bool parse_ipv6(const char *s,
		       const char *ifaces,
		       unsigned port,
		       ctdb_sock_addr *saddr)
{
	saddr->ip6.sin6_family   = AF_INET6;
	saddr->ip6.sin6_port     = htons(port);
	saddr->ip6.sin6_flowinfo = 0;
	saddr->ip6.sin6_scope_id = 0;

	if (inet_pton(AF_INET6, s, &saddr->ip6.sin6_addr) != 1) {
		DBG_ERR("Failed to translate %s into sin6_addr\n", s);
		return false;
	}

	if (ifaces != NULL && IN6_IS_ADDR_LINKLOCAL(&saddr->ip6.sin6_addr)) {
		if (strchr(ifaces, ',') != NULL) {
			DBG_ERR("Link local address %s "
				"is specified for multiple ifaces %s\n",
				s, ifaces);
			return false;
		}
		/*
		 * NOTE(review): if_nametoindex() returns 0 for an
		 * unknown interface; that is stored as the scope id
		 * without being treated as an error here.
		 */
		saddr->ip6.sin6_scope_id = if_nametoindex(ifaces);
	}

#ifdef HAVE_SOCK_SIN6_LEN
	/* Length of the IPv6 socket address, not of the whole container */
	saddr->ip6.sin6_len = sizeof(saddr->ip6);
#endif
	return true;
}

/*
 * Parse the textual IPv4 or IPv6 address addr into saddr, using the
 * given host byte order port.
 *
 * A string without ':' is parsed as IPv4, anything else as IPv6 (see
 * parse_ipv6() for the meaning of ifaces).  An IPv4-mapped IPv6
 * address (e.g. ::ffff:192.0.2.128) is stored as a plain IPv4
 * address.
 *
 * Returns true on success, false if the address can not be parsed.
 */
static bool parse_ip(const char *addr,
		     const char *ifaces,
		     unsigned port,
		     ctdb_sock_addr *saddr)
{
	static const uint8_t ipv4_mapped_prefix[12] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
	};
	const char *p;
	bool ret;

	ZERO_STRUCTP(saddr); /* valgrind :-) */

	/*
	 * IPv4 or IPv6 address?
	 *
	 * Look for the right-most ':' because it is needed below for
	 * IPv4-mapped IPv6 addresses anyway...
	 */
	p = strrchr(addr, ':');
	if (p == NULL) {
		return parse_ipv4(addr, port, &saddr->ip);
	}

	ret = parse_ipv6(addr, ifaces, port, saddr);
	if (! ret) {
		return false;
	}

	/*
	 * Check for IPv4-mapped IPv6 address
	 * (e.g. ::ffff:192.0.2.128) - reparse as IPv4 if
	 * necessary
	 */
	if (memcmp(&saddr->ip6.sin6_addr.s6_addr[0],
		   ipv4_mapped_prefix,
		   sizeof(ipv4_mapped_prefix)) == 0) {
		/*
		 * parse_ipv4() only sets the family, port and
		 * address, so clear whatever parse_ipv6() stored
		 * before reparsing.
		 */
		ZERO_STRUCTP(saddr);

		/* Reparse the part after the last ':' as IPv4 */
		ret = parse_ipv4(p+1, port, &saddr->ip);
	}

	return ret;
}

/*
 * Parse an ip/mask pair of the form "<address>/<mask>".
 *
 * str    - string to parse; must be shorter than 64 bytes
 * ifaces - passed on to the address parser (used for IPv6 link-local
 *          addresses)
 * addr   - receives the parsed address (port 0); zeroed first
 * mask   - receives the decimal number after the last '/'
 *
 * Returns true on success.  Returns false, after logging, if the
 * string is too long, has no '/', has nothing after the '/', has
 * trailing garbage after the mask or if the address does not parse.
 */
bool parse_ip_mask(const char *str,
		   const char *ifaces,
		   ctdb_sock_addr *addr,
		   unsigned *mask)
{
	char *p;
	char s[64]; /* Much longer than INET6_ADDRSTRLEN */
	char *endp = NULL;
	size_t len;

	ZERO_STRUCT(*addr);

	/* strlcpy() returns the length of str, so >= size means truncated */
	len = strlcpy(s, str, sizeof(s));
	if (len >= sizeof(s)) {
		DBG_ERR("Address %s is unreasonably long\n", str);
		return false;
	}

	p = strrchr(s, '/');
	if (p == NULL || p[1] == '\0') {
		/* No '/' at all, or nothing following it */
		DBG_ERR("Address %s does not contain a mask\n", s);
		return false;
	}

	*mask = strtoul(p+1, &endp, 10);
	if (endp == NULL || *endp != 0) {
		/* trailing garbage */
		DBG_ERR("Trailing garbage after the mask in %s\n", s);
		return false;
	}

	/* Cut the string at the '/' so only the address remains */
	*p = 0;

	/* now is this a ipv4 or ipv6 address ?*/
	return parse_ip(s, ifaces, 0, addr);
}

/*
 * Simple TCP checksum over IPv4 - assumes data is a multiple of 2
 * bytes long.
 *
 * Sums the n bytes at data together with the pseudo-header values
 * taken from ip (source address, destination address, protocol and
 * the length n), folds the total to 16 bits and returns its one's
 * complement in network byte order.  A result of 0 is returned as
 * 0xFFFF.
 */
static uint16_t ip_checksum(uint16_t *data, size_t n, struct ip *ip)
{
	uint32_t acc;
	uint16_t result;
	int i;

	acc  = uint16_checksum(data, n);
	acc += uint16_checksum((uint16_t *)&ip->ip_src, sizeof(ip->ip_src));
	acc += uint16_checksum((uint16_t *)&ip->ip_dst, sizeof(ip->ip_dst));
	acc += ip->ip_p + n;

	/* Fold the carries back in; two rounds are enough */
	for (i = 0; i < 2; i++) {
		acc = (acc & 0xFFFF) + (acc >> 16);
	}

	result = (uint16_t)~htons(acc);

	return (result != 0) ? result : 0xFFFF;
}

/*
 * Upper-layer checksum over IPv6.
 *
 * Sums the IPv6 pseudo-header built from ip6 (source address,
 * destination address, 32-bit upper-layer length n and next header
 * value) together with the n bytes at data, folds the total to 16
 * bits and returns its one's complement in network byte order.  A
 * result of 0 is returned as 0xFFFF.
 */
static uint16_t ip6_checksum(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
	uint32_t len = (uint32_t)n;
	uint32_t sum = 0;
	uint16_t sum2;

	sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
	sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);

	/*
	 * The pseudo-header carries the length as a 32-bit big-endian
	 * value, i.e. the two 16-bit words (len >> 16) and
	 * (len & 0xFFFF), followed by three zero bytes and the next
	 * header value.  Add those words directly instead of reading a
	 * uint32_t array through a uint16_t pointer.
	 */
	sum += (len >> 16) + (len & 0xFFFF);
	sum += ip6->ip6_nxt;

	sum += uint16_checksum(data, n);

	/* Fold the carries back into the low 16 bits */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum2 = htons(sum);
	sum2 = ~sum2;
	if (sum2 == 0) {
		return 0xFFFF;
	}
	return sum2;
}

/*
 * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
 */

#ifdef HAVE_PACKETSOCKET

/*
 * Fill in the opcode and the variable part of an Ethernet/IPv4 ARP
 * packet whose fixed header is ah.
 *
 * hwaddr is used as the sender hardware address and ip as both the
 * sender and the target protocol address.  target_hwaddr is the
 * target hardware address; if it is NULL the field is set to zero.
 */
static void arp_fill_payload(struct arphdr *ah,
			     uint16_t op,
			     const void *hwaddr,
			     const struct in_addr *ip,
			     const void *target_hwaddr)
{
	char *ptr = (char *)&ah[1];

	ah->ar_op = htons(op);

	memcpy(ptr, hwaddr, ETH_ALEN);
	ptr += ETH_ALEN;
	memcpy(ptr, ip, 4);
	ptr += 4;
	if (target_hwaddr != NULL) {
		memcpy(ptr, target_hwaddr, ETH_ALEN);
	} else {
		memset(ptr, 0, ETH_ALEN);
	}
	ptr += ETH_ALEN;
	memcpy(ptr, ip, 4);
}

/*
 * Announce that addr is reachable via the hardware address of iface.
 *
 * For an IPv4 address a gratuitous ARP request followed by an
 * unsolicited ARP reply is broadcast on iface.  For an IPv6 address a
 * neighbor advertisement with a target link-layer address option is
 * sent to the all-nodes multicast address.
 *
 * Returns 0 on success and also when iface is a loopback interface
 * (nothing is sent in that case).  Returns -1 on failure: unsupported
 * address family, socket/ioctl/sendto errors, or an interface that is
 * not Ethernet (errno is set to EINVAL for the latter).
 */
int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
{
	/* Number of bytes sent for each (zero padded) IPv4 ARP frame */
	enum { ARP_FRAME_LEN = 64 };
	int s, ret;
	struct sockaddr_ll sall;
	struct ether_header *eh;
	struct arphdr *ah;
	struct ip6_hdr *ip6;
	struct nd_neighbor_advert *nd_na;
	struct nd_opt_hdr *nd_oh;
	struct ifreq if_hwaddr;
	struct ifreq ifr;
	/* Size of IPv6 neighbor advertisement (with option) */
	unsigned char buffer[sizeof(struct ether_header) +
			     sizeof(struct ip6_hdr) +
			     sizeof(struct nd_neighbor_advert) +
			     sizeof(struct nd_opt_hdr) + ETH_ALEN];

	/* Reject unsupported families before any socket is opened */
	switch (addr->ip.sin_family) {
	case AF_INET:
	case AF_INET6:
		break;
	default:
		DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
			addr->ip.sin_family);
		return -1;
	}

	ZERO_STRUCT(sall);
	ZERO_STRUCT(ifr);
	ZERO_STRUCT(if_hwaddr);

	s = socket(AF_PACKET, SOCK_RAW, 0);
	if (s == -1){
		DBG_ERR("Failed to open raw socket\n");
		return -1;
	}

	DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
	strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
		DBG_ERR("Interface '%s' not found\n", iface);
		close(s);
		return -1;
	}

	/* get the mac address */
	strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
	ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
	if ( ret < 0 ) {
		close(s);
		DBG_ERR("ioctl failed\n");
		return -1;
	}
	if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
		DBG_DEBUG("Ignoring loopback arp request\n");
		close(s);
		return 0;
	}
	if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
		close(s);
		errno = EINVAL;
		DBG_ERR("Not an ethernet address family (0x%x)\n",
			if_hwaddr.ifr_hwaddr.sa_family);
		return -1;
	}

	/* Parts of the frame and link-layer address common to both families */
	memset(buffer, 0, sizeof(buffer));
	eh = (struct ether_header *)buffer;
	memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);

	sall.sll_family = AF_PACKET;
	sall.sll_halen = ETH_ALEN;
	sall.sll_protocol = htons(ETH_P_ALL);
	sall.sll_ifindex = ifr.ifr_ifindex;

	if (addr->ip.sin_family == AF_INET) {
		/* Ethernet broadcast destination */
		memset(eh->ether_dhost, 0xff, ETH_ALEN);
		eh->ether_type = htons(ETHERTYPE_ARP);
		memcpy(&sall.sll_addr[0], &eh->ether_dhost[0], ETH_ALEN);

		ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
		ah->ar_hrd = htons(ARPHRD_ETHER);
		ah->ar_pro = htons(ETH_P_IP);
		ah->ar_hln = ETH_ALEN;
		ah->ar_pln = 4;

		/* send a gratuitous arp (target hardware address zero) */
		arp_fill_payload(ah,
				 ARPOP_REQUEST,
				 if_hwaddr.ifr_hwaddr.sa_data,
				 &addr->ip.sin_addr,
				 NULL);
		ret = sendto(s, buffer, ARP_FRAME_LEN, 0,
			     (struct sockaddr *)&sall, sizeof(sall));
		if (ret < 0 ){
			close(s);
			DBG_ERR("Failed sendto\n");
			return -1;
		}

		/* send unsolicited arp reply broadcast */
		arp_fill_payload(ah,
				 ARPOP_REPLY,
				 if_hwaddr.ifr_hwaddr.sa_data,
				 &addr->ip.sin_addr,
				 if_hwaddr.ifr_hwaddr.sa_data);
		ret = sendto(s, buffer, ARP_FRAME_LEN, 0,
			     (struct sockaddr *)&sall, sizeof(sall));
		if (ret < 0 ){
			DBG_ERR("Failed sendto\n");
			close(s);
			return -1;
		}
	} else {
		/*
		 * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
		 * section 7) - note zeroes above!
		 */
		eh->ether_dhost[0] = eh->ether_dhost[1] = 0x33;
		eh->ether_dhost[5] = 0x01;
		eh->ether_type = htons(ETHERTYPE_IP6);
		memcpy(&sall.sll_addr[0], &eh->ether_dhost[0], ETH_ALEN);

		ip6 = (struct ip6_hdr *)(eh+1);
		ip6->ip6_vfc  = 0x60;
		ip6->ip6_plen = htons(sizeof(*nd_na) +
				      sizeof(struct nd_opt_hdr) +
				      ETH_ALEN);
		ip6->ip6_nxt  = IPPROTO_ICMPV6;
		ip6->ip6_hlim = 255;
		ip6->ip6_src  = addr->ip6.sin6_addr;

		/* all-nodes multicast */
		ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
		if (ret != 1) {
			close(s);
			DBG_ERR("Failed inet_pton\n");
			return -1;
		}

		nd_na = (struct nd_neighbor_advert *)(ip6+1);
		nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
		nd_na->nd_na_code = 0;
		nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
		nd_na->nd_na_target = addr->ip6.sin6_addr;
		/* Option: Target link-layer address */
		nd_oh = (struct nd_opt_hdr *)(nd_na+1);
		nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		nd_oh->nd_opt_len = 1;
		memcpy(&(nd_oh+1)[0], if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);

		/* Checksum is computed last, over the finished ICMPv6 body */
		nd_na->nd_na_cksum = ip6_checksum((uint16_t *)nd_na,
						  ntohs(ip6->ip6_plen), ip6);

		ret = sendto(s, buffer, sizeof(buffer),
			     0, (struct sockaddr *)&sall, sizeof(sall));
		if (ret < 0 ){
			close(s);
			DBG_ERR("Failed sendto\n");
			return -1;
		}
	}

	close(s);
	return 0;
}

#else /* HAVE_PACKETSOCKET */

/*
 * Fallback used when packet sockets are not available: sending an
 * ARP / neighbor advertisement is not implemented, so always fail
 * with errno set to ENOSYS.
 */
int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
{
	(void)addr;
	(void)iface;

	errno = ENOSYS;
	return -1;
}

#endif /* HAVE_PACKETSOCKET */

/*
 * Send tcp segment from the specified IP/port to the specified
 * destination IP/port.
 *
 * This is used to trigger the receiving host into sending its own ACK,
 * which should trigger early detection of TCP reset by the client
 * after IP takeover
 *
 * This can also be used to send RST segments (if rst is true) and also
 * if correct seq and ack numbers are provided.
 *
 * dest, src - destination and source address; the address family of
 *             src selects IPv4 or IPv6.  Ports are copied into the
 *             TCP header exactly as stored in the socket addresses.
 * seq, ack  - stored in the TCP header exactly as given (no byte
 *             order conversion is done here)
 * rst       - if non-zero the RST flag is set in addition to ACK
 *
 * Returns 0 if the whole segment was sent, -1 otherwise.
 */
int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
		      const ctdb_sock_addr *src,
		      uint32_t seq,
		      uint32_t ack,
		      int rst)
{
	int s;
	int ret;
	uint32_t one = 1;
	struct sockaddr_in6 dest6;
	struct {
		struct ip ip;
		struct tcphdr tcp;
	} ip4pkt;
	struct {
		struct ip6_hdr ip6;
		struct tcphdr tcp;
	} ip6pkt;
	int saved_errno;

	switch (src->ip.sin_family) {
	case AF_INET:
		ZERO_STRUCT(ip4pkt);
		ip4pkt.ip.ip_v     = 4;
		ip4pkt.ip.ip_hl    = sizeof(ip4pkt.ip)/4;
		ip4pkt.ip.ip_len   = htons(sizeof(ip4pkt));
		ip4pkt.ip.ip_ttl   = 255;
		ip4pkt.ip.ip_p     = IPPROTO_TCP;
		ip4pkt.ip.ip_src.s_addr    = src->ip.sin_addr.s_addr;
		ip4pkt.ip.ip_dst.s_addr    = dest->ip.sin_addr.s_addr;
		ip4pkt.ip.ip_sum   = 0;

		ip4pkt.tcp.th_sport = src->ip.sin_port;
		ip4pkt.tcp.th_dport = dest->ip.sin_port;
		ip4pkt.tcp.th_seq   = seq;
		ip4pkt.tcp.th_ack   = ack;
		ip4pkt.tcp.th_flags = TH_ACK;
		if (rst) {
			ip4pkt.tcp.th_flags |= TH_RST;
		}
		ip4pkt.tcp.th_off   = sizeof(ip4pkt.tcp)/4;
		/* this makes it easier to spot in a sniffer */
		ip4pkt.tcp.th_win   = htons(1234);
		/* th_sum is still zero here, as required for the checksum */
		ip4pkt.tcp.th_sum   = ip_checksum((uint16_t *)&ip4pkt.tcp,
						  sizeof(ip4pkt.tcp),
						  &ip4pkt.ip);

		/* open a raw socket to send this segment from */
		s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
		if (s == -1) {
			DBG_ERR("Failed to open raw socket (%s)\n",
				strerror(errno));
			return -1;
		}

		ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
		if (ret != 0) {
			DBG_ERR("Failed to setup IP headers (%s)\n",
				strerror(errno));
			close(s);
			return -1;
		}

		ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
			     (const struct sockaddr *)&dest->ip,
			     sizeof(dest->ip));
		/* close() may change errno, so remember sendto()'s value */
		saved_errno = errno;
		close(s);
		if (ret < 0 || (size_t)ret != sizeof(ip4pkt)) {
			D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
			return -1;
		}
		break;
	case AF_INET6:
		ZERO_STRUCT(ip6pkt);
		ip6pkt.ip6.ip6_vfc  = 0x60;
		ip6pkt.ip6.ip6_plen = htons(sizeof(ip6pkt.tcp));
		ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
		ip6pkt.ip6.ip6_hlim = 64;
		ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
		ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;

		ip6pkt.tcp.th_sport = src->ip6.sin6_port;
		ip6pkt.tcp.th_dport = dest->ip6.sin6_port;
		ip6pkt.tcp.th_seq   = seq;
		ip6pkt.tcp.th_ack   = ack;
		/* Always ACK, like the IPv4 case; RST only on request */
		ip6pkt.tcp.th_flags = TH_ACK;
		if (rst) {
			ip6pkt.tcp.th_flags |= TH_RST;
		}
		ip6pkt.tcp.th_off    = sizeof(ip6pkt.tcp)/4;
		/* this makes it easier to spot in a sniffer */
		ip6pkt.tcp.th_win   = htons(1234);
		ip6pkt.tcp.th_sum   = ip6_checksum((uint16_t *)&ip6pkt.tcp,
						   sizeof(ip6pkt.tcp),
						   &ip6pkt.ip6);

		s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
		if (s == -1) {
			DBG_ERR("Failed to open sending socket\n");
			return -1;

		}
		/*
		 * sendto() doesn't like it if the port is set and the
		 * socket is in raw mode.  Use a local copy with the
		 * port cleared instead of modifying the caller's
		 * (const) destination address.
		 */
		dest6 = dest->ip6;
		dest6.sin6_port = 0;

		ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
			     (const struct sockaddr *)&dest6,
			     sizeof(dest6));
		saved_errno = errno;
		close(s);

		if (ret < 0 || (size_t)ret != sizeof(ip6pkt)) {
			D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
			return -1;
		}
		break;

	default:
		DBG_ERR("Not an ipv4/v6 address\n");
		return -1;
	}

	return 0;
}

/*
 * Packet capture
 *
 * If AF_PACKET is available then use a raw socket otherwise use pcap.
 * wscript has checked to make sure that pcap is available if needed.
 */

#ifdef HAVE_AF_PACKET

/*
 * Open a raw packet socket to capture from.
 *
 * The socket receives all protocols (ETH_P_ALL), is switched to
 * non-blocking mode and is marked close-on-exec.  iface and
 * private_data are not used by this implementation.
 *
 * Returns the socket file descriptor, or -1 on failure.
 */
int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
{
	int fd;

	/* Open a socket to capture all traffic */
	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd == -1) {
		DBG_ERR("Failed to open raw socket\n");
		return -1;
	}

	DBG_DEBUG("Created RAW SOCKET FD:%d for tcp tickle\n", fd);

	if (set_blocking(fd, false) != 0) {
		DBG_ERR("Failed to set socket non-blocking (%s)\n",
			strerror(errno));
		close(fd);
		return -1;
	}

	set_close_on_exec(fd);

	return fd;
}

/*
 * Hook for any extra cleanup needed when a capture socket is closed.
 * The raw socket variant keeps no private state, so there is nothing
 * to do here and 0 is always returned.
 * Note that the socket itself is closed automatically in the caller.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	(void)private_data;

	return 0;
}


/*
 * called when the raw socket becomes readable
 *
 * Reads one packet from s and, if it is an unfragmented IPv4 TCP
 * segment or an IPv6 packet whose next header is TCP, reports its
 * endpoints and TCP details:
 *
 * src, dst - zeroed, then filled with the source and destination
 *            address and port
 * ack_seq  - receives th_ack
 * seq      - receives th_seq
 * rst      - if not NULL, receives the RST bit of th_flags
 * window   - if not NULL, receives th_win
 *
 * All TCP header values are copied as found in the packet, without
 * byte order conversion.  private_data is not used.
 *
 * Returns 0 if a TCP segment was extracted, -1 if the read failed or
 * the packet is too short or not of interest.
 */
int ctdb_sys_read_tcp_packet(int s, void *private_data,
			     ctdb_sock_addr *src,
			     ctdb_sock_addr *dst,
			     uint32_t *ack_seq,
			     uint32_t *seq,
			     int *rst,
			     uint16_t *window)
{
	ssize_t nread;
	size_t pktlen;
	size_t iphdrlen;
#define RCVPKTSIZE 100
	char pkt[RCVPKTSIZE];
	struct ether_header *eth;
	struct iphdr *ip;
	struct ip6_hdr *ip6;
	struct tcphdr *tcp;

	nread = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
	if (nread < 0) {
		/* Must be checked before any comparison with a size_t */
		return -1;
	}

	/*
	 * With MSG_TRUNC the real packet length is returned, which can
	 * be larger than what was stored in pkt.  Only the stored part
	 * may be looked at.
	 */
	pktlen = (size_t)nread;
	if (pktlen > RCVPKTSIZE) {
		pktlen = RCVPKTSIZE;
	}

	if (pktlen < sizeof(*eth)+sizeof(*ip)) {
		return -1;
	}

	ZERO_STRUCTP(src);
	ZERO_STRUCTP(dst);

	/* Ethernet */
	eth = (struct ether_header *)pkt;

	/* we want either IPv4 or IPv6 */
	if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
		/* IP */
		ip = (struct iphdr *)(eth+1);

		/* We only want IPv4 packets */
		if (ip->version != 4) {
			return -1;
		}
		/* Dont look at fragments */
		if ((ntohs(ip->frag_off)&0x1fff) != 0) {
			return -1;
		}
		/* we only want TCP */
		if (ip->protocol != IPPROTO_TCP) {
			return -1;
		}

		/* a header length below the fixed IP header is invalid */
		iphdrlen = (size_t)ip->ihl * 4;
		if (iphdrlen < sizeof(*ip)) {
			return -1;
		}

		/*
		 * make sure its not a short packet: the whole fixed
		 * TCP header is needed because flags and window are
		 * read below
		 */
		if (sizeof(*eth) + iphdrlen + sizeof(*tcp) > pktlen) {
			return -1;
		}
		/* TCP */
		tcp = (struct tcphdr *)(iphdrlen + (char *)ip);

		/* tell the caller which one we've found */
		src->ip.sin_family      = AF_INET;
		src->ip.sin_addr.s_addr = ip->saddr;
		src->ip.sin_port        = tcp->th_sport;
		dst->ip.sin_family      = AF_INET;
		dst->ip.sin_addr.s_addr = ip->daddr;
		dst->ip.sin_port        = tcp->th_dport;
		*ack_seq                = tcp->th_ack;
		*seq                    = tcp->th_seq;
		if (window != NULL) {
			*window = tcp->th_win;
		}
		if (rst != NULL) {
			*rst = tcp->th_flags & TH_RST;
		}

		return 0;
	} else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
		/* make sure its not a short packet */
		if (sizeof(*eth) + sizeof(*ip6) + sizeof(*tcp) > pktlen) {
			return -1;
		}

		/* IP6 */
		ip6 = (struct ip6_hdr *)(eth+1);

		/* we only want TCP */
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			return -1;
		}

		/* TCP */
		tcp = (struct tcphdr *)(ip6+1);

		/* tell the caller which one we've found */
		src->ip6.sin6_family = AF_INET6;
		src->ip6.sin6_port   = tcp->th_sport;
		src->ip6.sin6_addr   = ip6->ip6_src;

		dst->ip6.sin6_family = AF_INET6;
		dst->ip6.sin6_port   = tcp->th_dport;
		dst->ip6.sin6_addr   = ip6->ip6_dst;

		*ack_seq             = tcp->th_ack;
		*seq                 = tcp->th_seq;
		if (window != NULL) {
			*window = tcp->th_win;
		}
		if (rst != NULL) {
			*rst = tcp->th_flags & TH_RST;
		}

		return 0;
	}

	return -1;
}

#else /* HAVE_AF_PACKET */

#include <pcap.h>

int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
{
	pcap_t *pt;

	pt=pcap_open_live(iface, 100, 0, 0, NULL);
	if (pt == NULL) {
		DBG_ERR("Failed to open capture device %s\n", iface);
		return -1;
	}
	*((pcap_t **)private_data) = pt;

	return pcap_fileno(pt);
}

/*
 * Close the pcap handle that was stored via private_data when the
 * capture socket was opened.  Always returns 0.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	pcap_close((pcap_t *)private_data);

	return 0;
}

/*
 * Read the next packet from the pcap handle in private_data and, if
 * it is an unfragmented IPv4 TCP segment or an IPv6 packet whose next
 * header is TCP, report its endpoints and TCP details:
 *
 * src, dst - zeroed, then filled with the source and destination
 *            address and port
 * ack_seq  - receives th_ack
 * seq      - receives th_seq
 * rst      - if not NULL, receives the RST bit of th_flags
 * window   - if not NULL, receives th_win
 *
 * All TCP header values are copied as found in the packet, without
 * byte order conversion.  The file descriptor s is not used.
 *
 * Returns 0 if a TCP segment was extracted, -1 if no packet was
 * available or the packet is too short or not of interest.
 */
int ctdb_sys_read_tcp_packet(int s,
			     void *private_data,
			     ctdb_sock_addr *src,
			     ctdb_sock_addr *dst,
			     uint32_t *ack_seq,
			     uint32_t *seq,
			     int *rst,
			     uint16_t *window)
{
	const struct ether_header *eth;
	const struct ip *ip;
	const struct ip6_hdr *ip6;
	const struct tcphdr *tcp;
	struct pcap_pkthdr pkthdr;
	const u_char *buffer;
	size_t caplen;
	size_t iphdrlen;
	pcap_t *pt = (pcap_t *)private_data;

	buffer=pcap_next(pt, &pkthdr);
	if (buffer==NULL) {
		return -1;
	}

	/*
	 * Only caplen bytes are present in buffer; len is the length
	 * on the wire and can be larger.
	 */
	caplen = pkthdr.caplen;
	if (caplen < sizeof(*eth)) {
		return -1;
	}

	ZERO_STRUCTP(src);
	ZERO_STRUCTP(dst);

	/* Ethernet */
	eth = (const struct ether_header *)buffer;

	/* we want either IPv4 or IPv6 */
	if (eth->ether_type == htons(ETHERTYPE_IP)) {
		if (caplen < sizeof(*eth) + sizeof(*ip)) {
			return -1;
		}

		/* IP */
		ip = (const struct ip *)(eth+1);

		/* We only want IPv4 packets */
		if (ip->ip_v != 4) {
			return -1;
		}
		/* Dont look at fragments */
		if ((ntohs(ip->ip_off)&0x1fff) != 0) {
			return -1;
		}
		/* we only want TCP */
		if (ip->ip_p != IPPROTO_TCP) {
			return -1;
		}

		/* a header length below the fixed IP header is invalid */
		iphdrlen = (size_t)ip->ip_hl * 4;
		if (iphdrlen < sizeof(*ip)) {
			return -1;
		}

		/*
		 * make sure its not a short packet: the Ethernet
		 * header, the IP header and the whole fixed TCP header
		 * must have been captured
		 */
		if (sizeof(*eth) + iphdrlen + sizeof(*tcp) > caplen) {
			return -1;
		}
		/* TCP */
		tcp = (const struct tcphdr *)(iphdrlen + (const char *)ip);

		/* tell the caller which one we've found */
		src->ip.sin_family      = AF_INET;
		src->ip.sin_addr.s_addr = ip->ip_src.s_addr;
		src->ip.sin_port        = tcp->th_sport;
		dst->ip.sin_family      = AF_INET;
		dst->ip.sin_addr.s_addr = ip->ip_dst.s_addr;
		dst->ip.sin_port        = tcp->th_dport;
		*ack_seq                = tcp->th_ack;
		*seq                    = tcp->th_seq;
		if (window != NULL) {
			*window = tcp->th_win;
		}
		if (rst != NULL) {
			*rst = tcp->th_flags & TH_RST;
		}

		return 0;
	} else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
		/* make sure its not a short packet */
		if (sizeof(*eth) + sizeof(*ip6) + sizeof(*tcp) > caplen) {
			return -1;
		}

		/* IP6 */
		ip6 = (const struct ip6_hdr *)(eth+1);

		/* we only want TCP */
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			return -1;
		}

		/* TCP */
		tcp = (const struct tcphdr *)(ip6+1);

		/* tell the caller which one we've found */
		src->ip6.sin6_family = AF_INET6;
		src->ip6.sin6_port   = tcp->th_sport;
		src->ip6.sin6_addr   = ip6->ip6_src;

		dst->ip6.sin6_family = AF_INET6;
		dst->ip6.sin6_port   = tcp->th_dport;
		dst->ip6.sin6_addr   = ip6->ip6_dst;

		*ack_seq             = tcp->th_ack;
		*seq                 = tcp->th_seq;
		if (window != NULL) {
			*window = tcp->th_win;
		}
		if (rst != NULL) {
			*rst = tcp->th_flags & TH_RST;
		}

		return 0;
	}

	return -1;
}

#endif /* HAVE_AF_PACKET */
