/*
 *
 * Copyright 2014, Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "src/core/endpoint/tcp.h"

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

#include "src/core/eventmanager/em.h"
#include <grpc/support/alloc.h>
#include <grpc/support/log.h>
#include <grpc/support/slice.h>
#include <grpc/support/string.h>
#include <grpc/support/sync.h>
#include <grpc/support/time.h>

/* Holds a slice array and associated state. */
typedef struct grpc_tcp_slice_state {
  gpr_slice *slices;       /* Array of slices */
  size_t nslices;          /* Size of the slices array */
  ssize_t first_slice;     /* First valid slice in array, or -1 if empty */
  ssize_t last_slice;      /* Last valid slice in array */
  gpr_slice working_slice; /* Copy of the original (untrimmed) final slice,
                              kept so its unused tail can be reused */
  int working_slice_valid; /* True if there is a working slice */
  int memory_owned;        /* True if the slices array is owned */
} grpc_tcp_slice_state;
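
/* A minimal sketch of the state invariants (illustrative, not executed): an
   empty state has first_slice == -1, while a state tracking two freshly
   allocated slices has first_slice == 0, last_slice == 1, and
   slice_state_slices_allocated() == 2:

     gpr_slice slices[2];
     grpc_tcp_slice_state state;
     slices[0] = gpr_slice_malloc(5);
     slices[1] = gpr_slice_malloc(7);
     slice_state_init(&state, slices, 2, 2);
     GPR_ASSERT(slice_state_has_available(&state));
     slice_state_destroy(&state);
*/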

static void slice_state_init(grpc_tcp_slice_state *state, gpr_slice *slices,
                             size_t nslices, size_t valid_slices) {
  state->slices = slices;
  state->nslices = nslices;
  if (valid_slices == 0) {
    state->first_slice = -1;
  } else {
    state->first_slice = 0;
  }
  /* Cast before subtracting so that valid_slices == 0 yields -1 rather than
     relying on unsigned wraparound. */
  state->last_slice = (ssize_t)valid_slices - 1;
  state->working_slice_valid = 0;
  state->memory_owned = 0;
}

/* Returns true if there is still available data */
static int slice_state_has_available(grpc_tcp_slice_state *state) {
  return state->first_slice != -1 && state->last_slice >= state->first_slice;
}

static ssize_t slice_state_slices_allocated(grpc_tcp_slice_state *state) {
  if (state->first_slice == -1) {
    return 0;
  } else {
    return state->last_slice - state->first_slice + 1;
  }
}

static void slice_state_realloc(grpc_tcp_slice_state *state, size_t new_size) {
  /* TODO(klempner): use realloc instead when first_slice is 0 */
  /* TODO(klempner): Avoid a realloc in cases where it is unnecessary */
  gpr_slice *slices = state->slices;
  size_t original_size = slice_state_slices_allocated(state);
  size_t i;
  gpr_slice *new_slices = gpr_malloc(sizeof(gpr_slice) * new_size);

  for (i = 0; i < original_size; ++i) {
    new_slices[i] = slices[i + state->first_slice];
  }

  state->slices = new_slices;
  state->last_slice = (ssize_t)original_size - 1;
  if (original_size > 0) {
    state->first_slice = 0;
  } else {
    state->first_slice = -1;
  }
  state->nslices = new_size;

  if (state->memory_owned) {
    gpr_free(slices);
  }
  state->memory_owned = 1;
}
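
/* Compaction example (illustrative): a state with nslices == 8,
   first_slice == 3 and last_slice == 5 holds three live slices. After
   slice_state_realloc(state, 16), those three occupy indices 0..2 of a
   16-entry owned array, so first_slice == 0 and last_slice == 2; the old
   array is freed only if the state owned it. */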

static void slice_state_remove_prefix(grpc_tcp_slice_state *state,
                                      size_t prefix_bytes) {
  gpr_slice *current_slice = &state->slices[state->first_slice];
  size_t current_slice_size;

  while (slice_state_has_available(state)) {
    current_slice_size = GPR_SLICE_LENGTH(*current_slice);
    if (current_slice_size > prefix_bytes) {
      /* TODO(klempner): Get rid of the extra refcount created here by adding a
         native "trim the first N bytes" operation to slice */
      /* TODO(klempner): This really shouldn't be modifying the current slice
         unless we own the slices array. */
      gpr_slice tail = gpr_slice_split_tail(current_slice, prefix_bytes);
      gpr_slice_unref(*current_slice);
      *current_slice = tail;
      return;
    } else {
      gpr_slice_unref(*current_slice);
      ++state->first_slice;
      ++current_slice;
      prefix_bytes -= current_slice_size;
    }
  }
}
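
/* Worked example (illustrative): given live slices of lengths [5, 7] and
   prefix_bytes == 8, the loop unrefs and skips the 5-byte slice (3 bytes of
   prefix remain), then splits the 7-byte slice so that only its final 4
   bytes stay in the state. */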

static void slice_state_destroy(grpc_tcp_slice_state *state) {
  while (slice_state_has_available(state)) {
    gpr_slice_unref(state->slices[state->first_slice]);
    ++state->first_slice;
  }

  if (state->memory_owned) {
    gpr_free(state->slices);
    state->memory_owned = 0;
  }
}

/* Transfers ownership of the valid slices to the caller; the state keeps no
   references to them afterwards. */
static void slice_state_transfer_ownership(grpc_tcp_slice_state *state,
                                           gpr_slice **slices,
                                           size_t *nslices) {
  *slices = state->slices + state->first_slice;
  *nslices = state->last_slice - state->first_slice + 1;

  state->first_slice = -1;
  state->last_slice = -1;
}
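
/* Usage sketch (this is the pattern the read path below relies on): after a
   transfer the state reports no data available, so slice_state_destroy()
   releases only the slices array itself (if owned) and none of the
   transferred refs, which now belong to the caller:

     gpr_slice *slices;
     size_t nslices;
     slice_state_transfer_ownership(&state, &slices, &nslices);
     ... hand slices/nslices to the consumer, which unrefs them ...
     slice_state_destroy(&state);
*/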

/* Fills iov with the first min(iov_size, available) slices, returns number
   filled */
static size_t slice_state_to_iovec(grpc_tcp_slice_state *state,
                                   struct iovec *iov, size_t iov_size) {
  size_t nslices = state->last_slice - state->first_slice + 1;
  gpr_slice *slices = state->slices + state->first_slice;
  size_t i;
  if (nslices < iov_size) {
    iov_size = nslices;
  }

  for (i = 0; i < iov_size; ++i) {
    iov[i].iov_base = GPR_SLICE_START_PTR(slices[i]);
    iov[i].iov_len = GPR_SLICE_LENGTH(slices[i]);
  }
  return iov_size;
}
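
/* Example (illustrative): with live slices of lengths [5, 7, 9] and
   iov_size == 2, the gather list exposes only the first two slices,

     iov[0] = { start of slice 0, 5 }
     iov[1] = { start of slice 1, 7 }

   and 2 is returned; the caller comes back for the rest on a later pass. */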

/* Makes n blocks available at the end of state, writes them into iov, and
   returns the number of bytes allocated */
static size_t slice_state_append_blocks_into_iovec(grpc_tcp_slice_state *state,
                                                   struct iovec *iov, size_t n,
                                                   size_t slice_size) {
  size_t target_size;
  size_t i;
  size_t allocated_bytes;
  ssize_t allocated_slices = slice_state_slices_allocated(state);

  if (n - state->working_slice_valid >= state->nslices - state->last_slice) {
    /* Need to grow the slice array */
    target_size = state->nslices;
    do {
      target_size = target_size * 2;
    } while (target_size < allocated_slices + n - state->working_slice_valid);
    /* TODO(klempner): If this ever needs to support both prefix removal and
       append, we should be smarter about the growth logic here */
    slice_state_realloc(state, target_size);
  }

  i = 0;
  allocated_bytes = 0;

  if (state->working_slice_valid) {
    iov[0].iov_base = GPR_SLICE_END_PTR(state->slices[state->last_slice]);
    iov[0].iov_len = GPR_SLICE_LENGTH(state->working_slice) -
                     GPR_SLICE_LENGTH(state->slices[state->last_slice]);
    allocated_bytes += iov[0].iov_len;
    ++i;
    state->slices[state->last_slice] = state->working_slice;
    state->working_slice_valid = 0;
  }

  for (; i < n; ++i) {
    ++state->last_slice;
    state->slices[state->last_slice] = gpr_slice_malloc(slice_size);
    iov[i].iov_base = GPR_SLICE_START_PTR(state->slices[state->last_slice]);
    iov[i].iov_len = slice_size;
    allocated_bytes += slice_size;
  }
  if (state->first_slice == -1) {
    state->first_slice = 0;
  }
  return allocated_bytes;
}
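
/* Worked example of the working-slice branch above (illustrative): if the
   previous read filled only 100 bytes of an 8192-byte slice,
   slice_state_remove_last() left a 100-byte slice in slices[last_slice] and
   the untrimmed original in working_slice, so the next append exposes the
   unused tail instead of allocating:

     iov[0].iov_base == GPR_SLICE_END_PTR(slices[last_slice])
     iov[0].iov_len  == 8192 - 100 == 8092

   Fresh slice_size-byte blocks are then appended for the remaining iovec
   entries. */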

/* Removes the last 'bytes' bytes from state */
/* TODO(klempner): Consider having this defer actual deletion until later */
static void slice_state_remove_last(grpc_tcp_slice_state *state,
                                    size_t bytes) {
  while (bytes > 0 && slice_state_has_available(state)) {
    if (GPR_SLICE_LENGTH(state->slices[state->last_slice]) > bytes) {
      state->working_slice = state->slices[state->last_slice];
      state->working_slice_valid = 1;
      /* TODO(klempner): Combine these into a single operation that doesn't
         need to refcount */
      gpr_slice_unref(gpr_slice_split_tail(
          &state->slices[state->last_slice],
          GPR_SLICE_LENGTH(state->slices[state->last_slice]) - bytes));
      bytes = 0;
    } else {
      bytes -= GPR_SLICE_LENGTH(state->slices[state->last_slice]);
      gpr_slice_unref(state->slices[state->last_slice]);
      --state->last_slice;
      if (state->last_slice == -1) {
        state->first_slice = -1;
      }
    }
  }
}
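
/* Example (illustrative): with a single 8192-byte slice and bytes == 8092,
   the first branch keeps bytes [0, 100): the untrimmed slice is stashed in
   working_slice, gpr_slice_split_tail() shrinks slices[last_slice] to the
   100-byte head, and the ref on the returned tail is dropped immediately.
   The stash is what lets slice_state_append_blocks_into_iovec() refill the
   unused tail on the next read. */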

typedef struct {
  grpc_endpoint base;
  grpc_em *em;
  grpc_em_fd em_fd;
  int fd;
  size_t slice_size;
  gpr_refcount refcount;

  grpc_endpoint_read_cb read_cb;
  void *read_user_data;
  gpr_timespec read_deadline;
  grpc_endpoint_write_cb write_cb;
  void *write_user_data;
  gpr_timespec write_deadline;

  grpc_tcp_slice_state write_state;
} grpc_tcp;

static void grpc_tcp_handle_read(void *arg /* grpc_tcp */,
                                 grpc_em_cb_status status);
static void grpc_tcp_handle_write(void *arg /* grpc_tcp */,
                                  grpc_em_cb_status status);

#define DEFAULT_SLICE_SIZE 8192
grpc_endpoint *grpc_tcp_create(int fd, grpc_em *em) {
  return grpc_tcp_create_dbg(fd, em, DEFAULT_SLICE_SIZE);
}
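
/* Usage sketch (assumes fd is an already-connected, non-blocking socket and
   em is an initialized event manager; grpc_endpoint_destroy is assumed to be
   the endpoint wrapper that dispatches to grpc_tcp_destroy via the vtable):

     grpc_endpoint *ep = grpc_tcp_create(fd, &em);
     ... read and write through the grpc_endpoint interface ...
     grpc_endpoint_destroy(ep);
*/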

static void grpc_tcp_shutdown(grpc_endpoint *ep) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_em_fd_shutdown(&tcp->em_fd);
}

static void grpc_tcp_unref(grpc_tcp *tcp) {
  int refcount_zero = gpr_unref(&tcp->refcount);
  if (refcount_zero) {
    grpc_em_fd_destroy(&tcp->em_fd);
    close(tcp->fd);
    gpr_free(tcp);
  }
}

static void grpc_tcp_destroy(grpc_endpoint *ep) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_tcp_unref(tcp);
}

static void call_read_cb(grpc_tcp *tcp, gpr_slice *slices, size_t nslices,
                         grpc_endpoint_cb_status status) {
  grpc_endpoint_read_cb cb = tcp->read_cb;

#ifdef GRPC_TRACE_TCP
  size_t i;
  gpr_log(GPR_DEBUG, "read: status=%d", status);
  for (i = 0; i < nslices; i++) {
    char *dump =
        gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
                    GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
    gpr_log(GPR_DEBUG, "READ: %s", dump);
    gpr_free(dump);
  }
#endif

  tcp->read_cb = NULL;
  cb(tcp->read_user_data, slices, nslices, status);
}

#define INLINE_SLICE_BUFFER_SIZE 8
#define MAX_READ_IOVEC 4
static void grpc_tcp_handle_read(void *arg /* grpc_tcp */,
                                 grpc_em_cb_status status) {
  grpc_tcp *tcp = (grpc_tcp *)arg;
  int iov_size = 1;
  gpr_slice static_read_slices[INLINE_SLICE_BUFFER_SIZE];
  struct msghdr msg;
  struct iovec iov[MAX_READ_IOVEC];
  ssize_t read_bytes;
  ssize_t allocated_bytes;
  grpc_tcp_slice_state read_state;
  gpr_slice *final_slices;
  size_t final_nslices;

  slice_state_init(&read_state, static_read_slices, INLINE_SLICE_BUFFER_SIZE,
                   0);

  if (status == GRPC_CALLBACK_CANCELLED) {
    call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_SHUTDOWN);
    grpc_tcp_unref(tcp);
    return;
  }

  if (status == GRPC_CALLBACK_TIMED_OUT) {
    call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_TIMED_OUT);
    grpc_tcp_unref(tcp);
    return;
  }

  /* TODO(klempner): Limit the amount we read at once. */
  for (;;) {
    allocated_bytes = slice_state_append_blocks_into_iovec(
        &read_state, iov, iov_size, tcp->slice_size);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = iov_size;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;

    do {
      read_bytes = recvmsg(tcp->fd, &msg, 0);
    } while (read_bytes < 0 && errno == EINTR);

    if (read_bytes < allocated_bytes) {
      /* TODO(klempner): Consider a second read first, in hopes of getting a
       * quick EAGAIN and saving a bunch of allocations. */
      slice_state_remove_last(&read_state, read_bytes < 0
                                               ? allocated_bytes
                                               : allocated_bytes - read_bytes);
    }

    if (read_bytes < 0) {
      /* NB: After calling the user_cb a parallel call of the read handler may
       * be running. */
      if (errno == EAGAIN) {
        if (slice_state_has_available(&read_state)) {
          /* TODO(klempner): We should probably do the call into the
             application without all this junk on the stack */
          /* FIXME(klempner): Refcount properly */
          slice_state_transfer_ownership(&read_state, &final_slices,
                                         &final_nslices);
          call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_OK);
          slice_state_destroy(&read_state);
          grpc_tcp_unref(tcp);
        } else {
          /* Spurious read event, consume it here */
          slice_state_destroy(&read_state);
          grpc_em_fd_notify_on_read(&tcp->em_fd, grpc_tcp_handle_read, tcp,
                                    tcp->read_deadline);
        }
      } else {
        /* TODO(klempner): Log interesting errors */
        call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_ERROR);
        slice_state_destroy(&read_state);
        grpc_tcp_unref(tcp);
      }
      return;
    } else if (read_bytes == 0) {
      /* 0 read size ==> end of stream */
      if (slice_state_has_available(&read_state)) {
        /* there were bytes already read: pass them up to the application */
        slice_state_transfer_ownership(&read_state, &final_slices,
                                       &final_nslices);
        call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_EOF);
      } else {
        call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_EOF);
      }
      slice_state_destroy(&read_state);
      grpc_tcp_unref(tcp);
      return;
    } else if (iov_size < MAX_READ_IOVEC) {
      ++iov_size;
    }
  }
}
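
/* Note on the loop above (illustrative arithmetic): iov_size starts at 1 and
   grows by one after every recvmsg that fills its entire allocation, so with
   the default 8192-byte slice size successive passes offer 8192, 16384,
   24576 and then 32768 bytes, capping at MAX_READ_IOVEC slices. A short read
   trims the surplus via slice_state_remove_last, and the loop exits on
   EAGAIN, end of stream, or error. */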

static void grpc_tcp_notify_on_read(grpc_endpoint *ep, grpc_endpoint_read_cb cb,
                                    void *user_data, gpr_timespec deadline) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  GPR_ASSERT(tcp->read_cb == NULL);
  tcp->read_cb = cb;
  tcp->read_user_data = user_data;
  tcp->read_deadline = deadline;
  gpr_ref(&tcp->refcount);
  grpc_em_fd_notify_on_read(&tcp->em_fd, grpc_tcp_handle_read, tcp, deadline);
}
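
/* Sketch of a matching read consumer (illustrative; on_read is a
   hypothetical name, and grpc_endpoint_notify_on_read is assumed to be the
   endpoint wrapper for the vtable entry above). The callback receives
   ownership of the slice refs and must unref them, but should not retain the
   slices array pointer itself, since the endpoint may free that array after
   the callback returns:

     static void on_read(void *user_data, gpr_slice *slices, size_t nslices,
                         grpc_endpoint_cb_status status) {
       size_t i;
       for (i = 0; i < nslices; i++) {
         ... consume GPR_SLICE_START_PTR(slices[i]) ...
         gpr_slice_unref(slices[i]);
       }
     }

     grpc_endpoint_notify_on_read(ep, on_read, NULL, gpr_inf_future);
*/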

#define MAX_WRITE_IOVEC 16
static grpc_endpoint_write_status grpc_tcp_flush(grpc_tcp *tcp) {
  struct msghdr msg;
  struct iovec iov[MAX_WRITE_IOVEC];
  int iov_size;
  ssize_t sent_length;
  grpc_tcp_slice_state *state = &tcp->write_state;

  for (;;) {
    iov_size = slice_state_to_iovec(state, iov, MAX_WRITE_IOVEC);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = iov_size;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;

    do {
      /* TODO(klempner): Cork if this is a partial write */
      sent_length = sendmsg(tcp->fd, &msg, 0);
    } while (sent_length < 0 && errno == EINTR);

    if (sent_length < 0) {
      if (errno == EAGAIN) {
        return GRPC_ENDPOINT_WRITE_PENDING;
      } else {
        /* TODO(klempner): Log some of these */
        slice_state_destroy(state);
        return GRPC_ENDPOINT_WRITE_ERROR;
      }
    }

    /* TODO(klempner): Probably better to batch this after we finish
       flushing */
    slice_state_remove_prefix(state, sent_length);

    if (!slice_state_has_available(state)) {
      return GRPC_ENDPOINT_WRITE_DONE;
    }
  }
}
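
/* Worked example of the flush loop (illustrative): with 100000 bytes queued
   and a kernel that accepts 65536 bytes on the first sendmsg, the prefix
   removal drops those 65536 bytes and the loop retries; if the second
   sendmsg then fails with EAGAIN, the remaining 34464 bytes stay queued and
   GRPC_ENDPOINT_WRITE_PENDING tells the caller that a callback is owed once
   the fd becomes writable again. */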

static void grpc_tcp_handle_write(void *arg /* grpc_tcp */,
                                  grpc_em_cb_status status) {
  grpc_tcp *tcp = (grpc_tcp *)arg;
  grpc_endpoint_write_status write_status;
  grpc_endpoint_cb_status cb_status;
  grpc_endpoint_write_cb cb;

  cb_status = GRPC_ENDPOINT_CB_OK;

  if (status == GRPC_CALLBACK_CANCELLED) {
    cb_status = GRPC_ENDPOINT_CB_SHUTDOWN;
  } else if (status == GRPC_CALLBACK_TIMED_OUT) {
    cb_status = GRPC_ENDPOINT_CB_TIMED_OUT;
  }

  if (cb_status != GRPC_ENDPOINT_CB_OK) {
    slice_state_destroy(&tcp->write_state);
    cb = tcp->write_cb;
    tcp->write_cb = NULL;
    cb(tcp->write_user_data, cb_status);
    grpc_tcp_unref(tcp);
    return;
  }

  write_status = grpc_tcp_flush(tcp);
  if (write_status == GRPC_ENDPOINT_WRITE_PENDING) {
    grpc_em_fd_notify_on_write(&tcp->em_fd, grpc_tcp_handle_write, tcp,
                               tcp->write_deadline);
  } else {
    slice_state_destroy(&tcp->write_state);
    if (write_status == GRPC_ENDPOINT_WRITE_DONE) {
      cb_status = GRPC_ENDPOINT_CB_OK;
    } else {
      cb_status = GRPC_ENDPOINT_CB_ERROR;
    }
    cb = tcp->write_cb;
    tcp->write_cb = NULL;
    cb(tcp->write_user_data, cb_status);
    grpc_tcp_unref(tcp);
  }
}

static grpc_endpoint_write_status grpc_tcp_write(
    grpc_endpoint *ep, gpr_slice *slices, size_t nslices,
    grpc_endpoint_write_cb cb, void *user_data, gpr_timespec deadline) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_endpoint_write_status status;

#ifdef GRPC_TRACE_TCP
  size_t i;

  for (i = 0; i < nslices; i++) {
    char *data =
        gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
                    GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
    gpr_log(GPR_DEBUG, "WRITE %p: %s", tcp, data);
    gpr_free(data);
  }
#endif

  GPR_ASSERT(tcp->write_cb == NULL);
  slice_state_init(&tcp->write_state, slices, nslices, nslices);

  status = grpc_tcp_flush(tcp);
  if (status == GRPC_ENDPOINT_WRITE_PENDING) {
    /* TODO(klempner): Consider inlining rather than malloc for small
       nslices */
    slice_state_realloc(&tcp->write_state, nslices);
    gpr_ref(&tcp->refcount);
    tcp->write_cb = cb;
    tcp->write_user_data = user_data;
    tcp->write_deadline = deadline;
    grpc_em_fd_notify_on_write(&tcp->em_fd, grpc_tcp_handle_write, tcp,
                               tcp->write_deadline);
  }

  return status;
}
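
/* Write usage sketch (illustrative; on_write_done is a hypothetical name,
   and grpc_endpoint_write is assumed to be the endpoint wrapper for the
   vtable entry above). If the flush completes inline the callback is never
   invoked; it fires only after a GRPC_ENDPOINT_WRITE_PENDING result:

     static void on_write_done(void *user_data,
                               grpc_endpoint_cb_status status) {
       ... the write finished, timed out, or failed ...
     }

     gpr_slice slice = gpr_slice_from_copied_string("hello");
     switch (grpc_endpoint_write(ep, &slice, 1, on_write_done, NULL,
                                 gpr_inf_future)) {
       case GRPC_ENDPOINT_WRITE_DONE:    ... completed inline ...       break;
       case GRPC_ENDPOINT_WRITE_PENDING: ... wait for on_write_done ... break;
       case GRPC_ENDPOINT_WRITE_ERROR:   ... tear down the endpoint ... break;
     }
*/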

static const grpc_endpoint_vtable vtable = {grpc_tcp_notify_on_read,
                                            grpc_tcp_write, grpc_tcp_shutdown,
                                            grpc_tcp_destroy};

grpc_endpoint *grpc_tcp_create_dbg(int fd, grpc_em *em, size_t slice_size) {
  grpc_tcp *tcp = (grpc_tcp *)gpr_malloc(sizeof(grpc_tcp));
  tcp->base.vtable = &vtable;
  tcp->fd = fd;
  tcp->em = em;
  tcp->read_cb = NULL;
  tcp->write_cb = NULL;
  tcp->read_user_data = NULL;
  tcp->write_user_data = NULL;
  tcp->slice_size = slice_size;
  tcp->read_deadline = gpr_inf_future;
  tcp->write_deadline = gpr_inf_future;
  slice_state_init(&tcp->write_state, NULL, 0, 0);
  /* paired with unref in grpc_tcp_destroy */
  gpr_ref_init(&tcp->refcount, 1);
  grpc_em_fd_init(&tcp->em_fd, tcp->em, fd);
  return &tcp->base;
}
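
/* Refcount lifecycle note: the initial ref taken above is dropped by
   grpc_tcp_destroy via grpc_tcp_unref, and each pending read or write takes
   an extra ref (in grpc_tcp_notify_on_read and grpc_tcp_write) that its
   completion handler drops, so the fd is closed only once no callbacks
   remain outstanding. */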