How to retrieve a binary file from a http// in C and linux

Hello, I'm writing a program to download videos from youtube in C. It's based on the youtube-dl software that's written in python. I know that exists libcurl and others to help on this but I'd to do it via TCP to have a better understanding of the process. I already change messages with the site(receive html) but I can retrieve the binary content. My code is the following:

#include <sys/socket.h>
#include <arpa/inet.h>
#include <netdb.h>

#include <errno.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <regex.h>

char * http_post(const char * url, const char * params);
#define MAX_BUFFER 204800
char buffer[MAX_BUFFER+1];
char* match(const char *string, char *pattern)
{
    int    status;
    regex_t    re;
    regmatch_t match;
    char * tparam = NULL;
    if (regcomp(&re, pattern, REG_EXTENDED) != 0) {
        return NULL;      /* report error */
    }
   
    status = regexec(&re, string, 1, &match, 0);
    regfree(&re);
    if (status != 0) {
        return NULL;      /* report error */
    }
   
    tparam = (char*)malloc(sizeof(char)*match.rm_eo - match.rm_so);
    strncpy(tparam,&(string[match.rm_so+4]),match.rm_eo - match.rm_so-5);
    printf("tparam=%s\n",tparam);
    return tparam;
}



int main(){
 
   char data_location[50];
   FILE * fd = NULL;
   char * tparam = NULL;
   char video_id[]= {"V36AJg6L_3o&mode=related&search="};
   char * ret;

   http_post("www.youtube.com","/watch?v=V36AJg6L_3o&mode=related&search=");
   tparam = match(buffer,"[,{]t:'([^']*)'");
   printf("retornou\n");
  char tmp[64];
  sprintf(tmp,"/get_video?video_id=%s&t=%s",video_id, tparam);
   http_post("www.youtube.com", tmp);
   free(tparam);
   
   char * ptr = strstr(buffer,"Location:");
   if(ptr != NULL){  
      char *ptr_end = strstr(ptr+10,"\n");
      int i = 0;
      while(ptr+10 < ptr_end){
         data_location[i] = *(ptr+10) ;
         i++; ptr++;
      }
      data_location[i] = 0 ;
   }
 
   ret = http_post("cache.googlevideo.com","/get_video?video_id=V36AJg6L_3o&origin=lax-v220.lax.youtube.com");
   printf("%s\n",buffer);
   /*
   if((fd = fopen("video.flv","wb")) == NULL){
      printf("erro ao abrir arquivo\n");
   }
   
   fwrite((void*)buffer,MAX_BUFFER,1,fd);
   fclose(fd); */
   return 0 ;
}


char * http_post(const char * url, const char * params){
   int connectionFd;
   int in;
   unsigned long limit = MAX_BUFFER ,index =0;
   struct sockaddr_in servaddr;
   struct hostent * hptr = NULL;
   char data_location[50];
   FILE * fd = NULL;
   
   char tmp[512];
   memset(buffer,0,MAX_BUFFER+1);
   memset(tmp,0,512);
   if(strcmp(url,"cache.googlevideo.com") == 0){ // here is the problem. We've the right url wich is
                                                                             // hardcoded below, but I can't revice the binary.
                                                                             // the headers is to mimic firefox
      sprintf(tmp,"GET /get_video?video_id=V36AJg6L_3o&origin=lax-v220.lax.youtube.com HTTP/1.1\r\n\
Host: rd.cache.l.google.com\r\n\
User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0\r\n\
Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png, */*;q=0.5\r\n\
Accept-Language: en-us,en;q=0.5\r\n\
Accept-Encoding: gzip,deflate\r\n\
Keep-Alive: 300\r\n\
Connection: keep-alive\r\n\
Cookie:rememberme=false;PREF=ID=a624e297dff7318f:TM=1192648634:LM=1192650677:GM=1:S=qkuRlALR6e9KGcIH\r\n\n\0",params);
   printf("link:%s\n",tmp);
   }
   else{
      sprintf(tmp,"POST %s HTTP/1.0\r\n \
            User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0\r\n \
                  Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n  Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png, */*;q=0.5\r\n \
                  Accept-Language: en-us,en;q=0.5\r\n\r\n",params);
   }
      char * string = (char*)malloc(sizeof(char)*strlen(tmp)+1);
      memset(string, 0, sizeof(char)*strlen(tmp)+1);
      strcpy(string,tmp);
      connectionFd = socket(AF_INET,SOCK_STREAM,0);
      memset(&servaddr,0,sizeof(servaddr));
      servaddr.sin_family = AF_INET;
      servaddr.sin_port = htons(80);
      hptr = gethostbyname2(url,AF_INET);
   
      if(hptr == NULL){
         printf("não conseguiu resolver nome\n");
         return NULL;
      }
     
      servaddr.sin_addr.s_addr      =   *(unsigned long *) *(hptr->h_addr_list);
   
      if(connect(connectionFd,(struct sockaddr*)&servaddr,sizeof(servaddr)) ){
         printf("erro ao conectar: %s\n",errno);
      }
 
      write(connectionFd, string, strlen(string));
      while((in=read(connectionFd, &(buffer[index]), limit)) >0 ){
         index += in;
         limit -= in;
         printf("limit=%d\n",limit);
      }
     
      close(connectionFd);
      buffer[index] = 0 ;
      return NULL;
}
fabytesAsked:
Who is Participating?
 
jkrConnect With a Mentor Commented:
All binary data that you receive will be in MIME format (base64-encoded text), so you need to read that and then decode it. You will find portable source code all over the net, the most prominent beint the "uudecode" source code in ftp://mirrors.kernel.org/gnu/sharutils/
0
 
NopiusConnect With a Mentor Commented:

Hi, fabytes

if(strcmp(url,"cache.googlevideo.com") == 0){ // here is the problem. We've the right url wich is
                                                                             // hardcoded below, but I can't revice the binary.
                                                                             // the headers is to mimic firefox

Do you mean, this condition doesn't work as it should?
If the problem is not in that confition, I see 2 other possible problems:

1) Cookie is incorrect. You are using fixed cookie, not a dynamic coocie from the response from the server:
Cookie:rememberme=false;PREF=ID=a624e297dff7318f:TM=1192648634:LM=1192650677:GM=1:S=qkuRlALR6e9KGcIH\r\n\n\0
I guess the cookie value should be caugth from server's response.

2) You have incorrect read loop:
while((in=read(connectionFd, &(buffer[index]), limit)) >0 ){
         index += in;
         limit -= in;
         printf("limit=%d\n",limit);
      }

Don't rely on zero response from read() syscall. Rely on a Content-Length, cought as a header in response to GET/POST request. When connection is keep-alive, your loop would block at the end of the data. Also don't read into the same 200k buffer, its too small for large binary content.

0
Question has a verified solution.

Are you are experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.

All Courses

From novice to tech pro — start learning today.