summary refs log tree commit diff
diff options
context:
space:
mode:
author franck cuny <franck@lumberjaph.net> 2010-01-23 19:36:24 +0100
committer franck cuny <franck@lumberjaph.net> 2010-01-23 19:36:24 +0100
commit a7cc690ced15e1a0191d27034006bfb17a0deeb5 (patch)
tree 6cef1a2e07727e8cd5249764f461222073e8211a
download github-explorer-a7cc690ced15e1a0191d27034006bfb17a0deeb5.tar.gz
basic github crawler using api
-rw-r--r-- crawl.pl 25
-rw-r--r-- lib/githubexplorer.pm 51
-rw-r--r-- lib/githubexplorer/Gexf.pm 27
-rw-r--r-- lib/githubexplorer/Profile.pm 59
-rw-r--r-- lib/githubexplorer/Repositorie.pm 52
-rw-r--r-- lib/githubexplorer/Schema.pm 7
-rw-r--r-- lib/githubexplorer/Schema/Result/Follow.pm 18
-rw-r--r-- lib/githubexplorer/Schema/Result/Profiles.pm 28
-rw-r--r-- lib/githubexplorer/Schema/Result/Repositories.pm 23
9 files changed, 290 insertions, 0 deletions
diff --git a/crawl.pl b/crawl.pl
new file mode 100644
index 0000000..fa7ae4e
--- /dev/null
+++ b/crawl.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+# Command-line entry point of the github-explorer crawler.
+#
+# Options:
+#   --deploy    deploy the database schema before crawling
+#   --profiles  parsed but not used by this script
+#   --repo      passed to the crawler as its with_repo flag
+#
+# The GitHub credentials are read from the GITHUB_LOGIN and GITHUB_APIKEY
+# environment variables.
+use strict;
+use warnings;
+use lib ('lib');
+use githubexplorer;
+use Getopt::Long;
+
+# Stop on unknown or malformed options instead of crawling anyway.
+GetOptions(
+    'deploy'   => \my $deploy,
+    'profiles' => \my $profiles,
+    'repo'     => \my $repo,
+) or die "usage: $0 [--deploy] [--profiles] [--repo]\n";
+
+# api_login and api_token are required 'Str' attributes of the crawler;
+# report a missing variable by name before the constructor rejects undef.
+foreach my $name (qw/GITHUB_LOGIN GITHUB_APIKEY/) {
+    die "environment variable $name is not set\n"
+        unless defined $ENV{$name};
+}
+
+my $gh = githubexplorer->new(
+    seed      => [qw/franckcuny/],
+    api_token => $ENV{'GITHUB_APIKEY'},
+    api_login => $ENV{'GITHUB_LOGIN'},
+    with_repo => $repo,
+    connect_info =>
+        [ 'dbi:SQLite:dbname=test.sqlite', '', '', { AutoCommit => 1 } ],
+);
+
+$gh->deploy if $deploy;
+$gh->harvest_profiles;
+
diff --git a/lib/githubexplorer.pm b/lib/githubexplorer.pm
new file mode 100644
index 0000000..fdd609a
--- /dev/null
+++ b/lib/githubexplorer.pm
@@ -0,0 +1,69 @@
+package githubexplorer;
+use 5.010;
+use lib ('/home/franck/code/git/net-github/lib');
+use YAML::Syck;
+use Moose;
+use githubexplorer::Schema;
+
+# Crawler object: carries the GitHub credentials, the seed logins and the
+# database connection.  The crawling itself comes from the two roles.
+with qw/githubexplorer::Profile githubexplorer::Repositorie/;
+
+# Logins the crawl starts from.
+has seed => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+
+# Credentials given to the Net::GitHub::V2 clients created by the roles.
+has api_login => ( is => 'ro', isa => 'Str', required => 1 );
+has api_token => ( is => 'ro', isa => 'Str', required => 1 );
+
+# Argument list handed as is to githubexplorer::Schema->connect.
+has connect_info => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+
+# When true, fetch_profile also fetches the repositories of new profiles.
+has with_repo => ( is => 'ro', isa => 'Bool', default => 0 );
+
+# Connected schema; has_schema tells whether one has been set.
+has schema => (
+    is        => 'rw',
+    isa       => 'githubexplorer::Schema',
+    predicate => 'has_schema',
+);
+
+# Deploy the schema, connecting first when needed.
+sub deploy {
+    my $self = shift;
+    $self->_connect unless $self->has_schema;
+    $self->schema->deploy;
+}
+
+# Connect a schema with connect_info and store it in the schema attribute.
+sub _connect {
+    my ($self) = @_;
+    $self->schema(
+        githubexplorer::Schema->connect( @{ $self->connect_info } ) );
+}
+
+# Run fetch_profile on every seed login.  $depth is the depth given to
+# the seeds; it defaults to 1 when undefined.
+sub harvest_profiles {
+    my ( $self, $depth ) = @_;
+    $self->_connect unless $self->has_schema;
+    $depth = 1 unless defined $depth;
+    for my $seed_login ( @{ $self->seed } ) {
+        $self->fetch_profile( $seed_login, $depth );
+    }
+}
+
+# Run fetch_repo on every row of the Profiles resultset.
+# NOTE(review): fetch_repo takes a repository name as second argument and
+# none is passed here -- confirm what this method is meant to fetch.
+sub harvest_repo {
+    my $self = shift;
+    $self->_connect unless $self->has_schema;
+    my $rows = $self->schema->resultset('Profiles')->search();
+    while ( my $row = $rows->next ) {
+        $self->fetch_repo($row);
+    }
+}
+
+1;
diff --git a/lib/githubexplorer/Gexf.pm b/lib/githubexplorer/Gexf.pm
new file mode 100644
index 0000000..a82a741
--- /dev/null
+++ b/lib/githubexplorer/Gexf.pm
@@ -0,0 +1,27 @@
+package githubexplorer::Gexf;
+
+use Moose;
+use XML::Simple;
+
+# Nested hash rooted at the 'gexf' key.  The default is built by a code
+# reference, so every object gets its own copy of the skeleton.
+has graph => (
+    isa     => 'HashRef',
+    is      => 'rw',
+    default => sub {
+        my %attributes = (
+            class     => 'node',
+            type      => 'static',
+            attribute => [ { id => 0, type => 'string' } ],
+        );
+        return {
+            gexf => {
+                version => "1.0",
+                meta    => { creator => ['rtgi'] },
+                graph   => { type => 'static', attributes => \%attributes },
+            },
+        };
+    },
+);
+
+1;
diff --git a/lib/githubexplorer/Profile.pm b/lib/githubexplorer/Profile.pm
new file mode 100644
index 0000000..f580f79
--- /dev/null
+++ b/lib/githubexplorer/Profile.pm
@@ -0,0 +1,91 @@
+package githubexplorer::Profile;
+use 5.010;
+use Moose::Role;
+use Net::GitHub::V2::Users;
+
+# Role that crawls GitHub profiles and their followers.  It calls schema,
+# api_login, api_token, with_repo and fetch_repo on the consuming object.
+
+# Followers are not crawled once their depth would exceed this value.
+my $MAX_DEPTH = 3;
+
+# Fetch the profile of $login at crawl depth $depth, then its followers.
+#
+# A profile that is not stored yet is read with show() and inserted,
+# together with its repositories when with_repo is true.  Every follower
+# is fetched at $depth + 1 and linked to this profile by a Follow row.
+# Returns the Profiles row of $login.
+sub fetch_profile {
+    my ( $self, $login, $depth ) = @_;
+
+    my $profile = $self->_profile_exists($login);
+
+    say "fetch profile for $login ($depth)...";
+    sleep(1);
+    my $github = Net::GitHub::V2::Users->new(
+        owner => $login,
+        login => $self->api_login,
+        token => $self->api_token,
+    );
+    sleep(2);
+
+    if ( !$profile ) {
+        $profile = $self->_create_profile( $login, $github->show, $depth );
+        if ( $self->with_repo ) {
+            foreach my $repo ( @{ $github->list } ) {
+                $self->fetch_repo( $profile, $repo->{name} );
+            }
+        }
+        sleep(1);
+    }
+
+    # Test the depth limit before calling followers(): its result is only
+    # used below the limit.
+    my $next_depth = $depth + 1;
+    return $profile if $next_depth > $MAX_DEPTH;
+
+    my $followers = $github->followers();
+    foreach my $f (@$followers) {
+        my $p = $self->fetch_profile( $f, $next_depth );
+        next unless $p;
+
+        # (id_follower, id_following) is the primary key of Follow, and a
+        # stored profile is crawled again on a later run, so the edge may
+        # already exist: insert it only when it is missing.
+        $self->schema->resultset('Follow')->find_or_create(
+            { id_following => $profile->id, id_follower => $p->id } );
+    }
+    return $profile;
+}
+
+# Return the Profiles row whose login is $login, or a false value when no
+# such row is stored.
+sub _profile_exists {
+    my ( $self, $login ) = @_;
+    my $profile
+        = $self->schema->resultset('Profiles')->find( { login => $login } );
+    return $profile;
+}
+
+# Insert a Profiles row from $profile, the hash reference describing the
+# user, with $depth as its depth.  $user_name is not used.  Returns the
+# new row.
+sub _create_profile {
+    my ( $self, $user_name, $profile, $depth ) = @_;
+
+    my $profiles = $self->schema->resultset('Profiles');
+
+    # Copy only the keys that are columns of the table: create() throws on
+    # an unknown column, and the caller's hash is left untouched.
+    my %row;
+    foreach my $column ( $profiles->result_source->columns ) {
+        $row{$column} = $profile->{$column} if exists $profile->{$column};
+    }
+    $row{depth} = $depth;
+
+    my $profile_rs = $profiles->create( \%row );
+    say $profile_rs->login."'s profile created";
+    return $profile_rs;
+}
+
+1;
diff --git a/lib/githubexplorer/Repositorie.pm b/lib/githubexplorer/Repositorie.pm
new file mode 100644
index 0000000..907a3b8
--- /dev/null
+++ b/lib/githubexplorer/Repositorie.pm
@@ -0,0 +1,69 @@
+package githubexplorer::Repositorie;
+use 5.010;
+use Moose::Role;
+use Net::GitHub::V2::Repositories;
+
+# Role that stores repositories using Perl.  It calls schema, api_login
+# and api_token on the consuming object.
+
+# Store repository $repo_name of $profile (a Profiles row) when one of its
+# languages matches /perl/i.  A repository that is already stored is
+# skipped before any GitHub client is created.
+sub fetch_repo {
+    my ( $self, $profile, $repo_name ) = @_;
+
+    return if $self->_repo_exists( $profile, $repo_name );
+
+    say "check ".$profile->login."'s $repo_name";
+    sleep(1);
+    my $github = Net::GitHub::V2::Repositories->new(
+        owner => $profile->login,
+        repo  => $repo_name,
+        login => $self->api_login,
+        token => $self->api_token,
+    );
+
+    # Anything but a hash reference counts as "no language", so that an
+    # unexpected answer skips the repository instead of dying here.
+    my $languages = $github->languages();
+    my $langs = [ ref($languages) eq 'HASH' ? keys %$languages : () ];
+    sleep(1);
+    return unless grep {/perl/i} @$langs;
+    my $repo_desc = $github->show();
+    $repo_desc->{languages} = $langs;
+    $self->_create_repo( $profile, $repo_desc );
+    sleep(1);
+}
+
+# Return 1 when a repository named $repo_name is stored for $profile, and
+# 0 otherwise.
+sub _repo_exists {
+    my ( $self, $profile, $repo_name ) = @_;
+
+    # Return an explicit boolean: "return if find(...)" as the only
+    # statement yields a false value whether or not the row exists.
+    my $stored = $self->schema->resultset('Repositories')
+        ->find( { name => $repo_name, id_profile => $profile->id } );
+    return $stored ? 1 : 0;
+}
+
+# Return the Repositories row named $repo_desc->{name} for $profile,
+# inserting it from the description hash when it is not stored yet.
+sub _create_repo {
+    my ( $self, $profile, $repo_desc ) = @_;
+
+    my $repo_rs = $self->schema->resultset('Repositories')
+        ->find( { id_profile => $profile->id, name => $repo_desc->{name} } );
+    if ( !$repo_rs ) {
+        my $repo_insert = {
+            id_profile => $profile->id,
+            map { $_ => $repo_desc->{$_} }
+                (qw/description name homepage url watchers forks/)
+        };
+        $repo_rs
+            = $self->schema->resultset('Repositories')->create($repo_insert);
+    }
+    return $repo_rs;
+}
+
+1;
diff --git a/lib/githubexplorer/Schema.pm b/lib/githubexplorer/Schema.pm
new file mode 100644
index 0000000..306480c
--- /dev/null
+++ b/lib/githubexplorer/Schema.pm
@@ -0,0 +1,9 @@
+package githubexplorer::Schema;
+
+# DBIx::Class schema used by the crawler.
+use base 'DBIx::Class::Schema';
+
+# Load the classes found under this schema's namespaces.
+__PACKAGE__->load_namespaces;
+
+1;
diff --git a/lib/githubexplorer/Schema/Result/Follow.pm b/lib/githubexplorer/Schema/Result/Follow.pm
new file mode 100644
index 0000000..735980b
--- /dev/null
+++ b/lib/githubexplorer/Schema/Result/Follow.pm
@@ -0,0 +1,23 @@
+package githubexplorer::Schema::Result::Follow;
+
+use base 'DBIx::Class';
+
+# One row per (follower, following) pair of profile ids.
+__PACKAGE__->load_components('Core');
+__PACKAGE__->table('follow');
+
+__PACKAGE__->add_columns(
+    id_follower  => { data_type => 'int' },
+    id_following => { data_type => 'int' },
+);
+
+# The pair of ids is the key of the table.
+__PACKAGE__->set_primary_key( 'id_follower', 'id_following' );
+
+# Each of the two columns belongs to a Profiles row.
+foreach my $column (qw/id_follower id_following/) {
+    __PACKAGE__->belongs_to( $column,
+        'githubexplorer::Schema::Result::Profiles' );
+}
+
+1;
diff --git a/lib/githubexplorer/Schema/Result/Profiles.pm b/lib/githubexplorer/Schema/Result/Profiles.pm
new file mode 100644
index 0000000..001057e
--- /dev/null
+++ b/lib/githubexplorer/Schema/Result/Profiles.pm
@@ -0,0 +1,34 @@
+package githubexplorer::Schema::Result::Profiles;
+
+use base qw/DBIx::Class/;
+
+# One row per stored GitHub profile.
+__PACKAGE__->load_components(qw/Core/);
+__PACKAGE__->table('profiles');
+__PACKAGE__->add_columns(
+    id                => { data_type => 'integer', },
+    login             => { data_type => 'varchar' },
+    blog              => { data_type => 'varchar', is_nullable => 1 },
+    company           => { data_type => 'varchar', is_nullable => 1 },
+    created_at        => { data_type => 'timestamp' },
+    email             => { data_type => 'varchar', is_nullable => 1 },
+    followers_count   => { data_type => 'int' },
+    following_count   => { data_type => 'int' },
+    gravatar_id       => { data_type => 'varchar', is_nullable => 1 },
+    location          => { data_type => 'varchar', is_nullable => 1 },
+    name              => { data_type => 'varchar', is_nullable => 1 },
+    public_gist_count => { data_type => 'int' },
+    public_repo_count => { data_type => 'int' },
+
+    # Crawl depth given to fetch_profile when the row was created: a
+    # number (seeds default to 1), hence an integer column, not a boolean.
+    depth             => { data_type => 'int' },
+);
+
+__PACKAGE__->set_primary_key('id');
+
+# Repositories rows whose id_profile refers to this profile.
+__PACKAGE__->has_many( 'get_repos',
+    'githubexplorer::Schema::Result::Repositories', 'id_profile' );
+
+1;
diff --git a/lib/githubexplorer/Schema/Result/Repositories.pm b/lib/githubexplorer/Schema/Result/Repositories.pm
new file mode 100644
index 0000000..641305f
--- /dev/null
+++ b/lib/githubexplorer/Schema/Result/Repositories.pm
@@ -0,0 +1,27 @@
+package githubexplorer::Schema::Result::Repositories;
+
+use base 'DBIx::Class';
+
+# One row per stored repository; id_profile refers to a Profiles row.
+__PACKAGE__->load_components('Core');
+__PACKAGE__->table('repositories');
+
+__PACKAGE__->add_columns(
+    id          => { is_auto_increment => 1, data_type => 'integer' },
+    description => { is_nullable       => 1, data_type => 'text' },
+    name        => { data_type         => 'varchar' },
+    homepage    => { is_nullable       => 1, data_type => 'varchar' },
+    url         => { is_nullable       => 1, data_type => 'varchar' },
+    watchers    => { data_type         => 'int' },
+    forks       => { data_type         => 'int' },
+    id_profile  => { is_foreign_key    => 1, data_type => 'int' },
+);
+
+__PACKAGE__->set_primary_key('id');
+__PACKAGE__->belongs_to(
+    id_profile => 'githubexplorer::Schema::Result::Profiles' );
+
+# A profile cannot hold two repositories with the same name.
+__PACKAGE__->add_unique_constraint( [ 'name', 'id_profile' ] );
+
+1;