summary refs log tree commit diff
diff options
context:
space:
mode:
authorfranck cuny <franck@lumberjaph.net>2010-02-12 16:41:02 +0100
committerfranck cuny <franck@lumberjaph.net>2010-02-12 16:41:02 +0100
commitd2551c9cc2e637835876fec5e9cb58f9e9f2061c (patch)
tree3f1ab54705e1a21d3978cd0da73ed8c12137074f
parentMerge branch 'master' of lj:github-explorer (diff)
downloadgithub-explorer-d2551c9cc2e637835876fec5e9cb58f9e9f2061c.tar.gz
wip
-rw-r--r--clean-country.pl34
-rw-r--r--crawl.pl2
-rw-r--r--lib/githubexplorer.pm29
-rw-r--r--lib/githubexplorer/Gexf.pm362
-rw-r--r--lib/githubexplorer/Schema/Result/Profiles.pm2
5 files changed, 303 insertions, 126 deletions
diff --git a/clean-country.pl b/clean-country.pl
new file mode 100644
index 0000000..9fb0b5c
--- /dev/null
+++ b/clean-country.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use lib ('lib');
+use 5.010;
+use Geo::GeoNames;
+use githubexplorer::Schema;
+use YAML::Syck;
+
+# Fill in city/country for profiles that only carry a free-text location, using a GeoNames search.
+my $conf = LoadFile( shift(@ARGV) // die "usage: $0 <config.yml>\n" );
+my $schema = githubexplorer::Schema->connect(@{$conf->{connect_info}});
+# A hash literal keeps only the last duplicate key, so both location tests are joined with -and.
+my $profiles = $schema->resultset('Profiles')->search({id => {'>' => 55781},
+        location => [ -and => {'!=' => undef}, {'!=' => ''} ]});
+
+my $geo = Geo::GeoNames->new();
+
+while (my $pr = $profiles->next) {
+    next if $pr->location =~ /^http/;
+    next if $pr->country;
+    next if $pr->location =~ /earth/i;
+    say "-> process ".$pr->login." with ".$pr->location;
+    my $result = $geo->search( q => $pr->location, maxRows => 1 );
+    my $res = shift @$result;
+    if ($res) {
+        my $ok = eval {
+            $pr->update({city => $res->{name}, country => $res->{countryName}}); 1;
+        };
+        warn "!! update failed for ".$pr->login.": $@" unless $ok;
+        say "** fix with ".$pr->city . " in ".$pr->country if $ok;
+    }
+    sleep(1);    # runs after every lookup, including failed updates (presumably to throttle requests)
+}
\ No newline at end of file
diff --git a/crawl.pl b/crawl.pl
index d0d911b..d844893 100644
--- a/crawl.pl
+++ b/crawl.pl
@@ -12,6 +12,7 @@ GetOptions(
     'repo'     => \my $repo,
     'graph'    => \my $graph,
     'network'  => \my $network,
+    'seed'  => \my $seed,
     'conf=s'   => \my $conf,
 );
 
@@ -30,3 +31,4 @@ $gh->harvest_profiles if $profiles;
 $gh->harvest_repo     if $repo;
 $gh->graph_repo       if $network;
 $gh->gen_graph        if $graph;
+$gh->gen_seed         if $seed;
diff --git a/lib/githubexplorer.pm b/lib/githubexplorer.pm
index 4260842..5744e08 100644
--- a/lib/githubexplorer.pm
+++ b/lib/githubexplorer.pm
@@ -25,8 +25,8 @@ has seed => (
         return \@seeds;
     }
 );
-has api_login    => ( isa => 'Str',      is => 'ro', required => 1 );
-has api_token    => ( isa => 'Str',      is => 'ro', required => 1 );
+has api_login    => ( isa => 'Str|Undef',      is => 'ro', required => 1 );
+has api_token    => ( isa => 'Str|Undef',      is => 'ro', required => 1 );
 has connect_info => ( isa => 'ArrayRef', is => 'ro', required => 1 );
 has with_repo    => ( isa => 'Bool',     is => 'ro', default  => sub {0} );
 has schema => (
@@ -69,8 +69,7 @@ sub gen_graph {
     my $self = shift;
     $self->_connect unless $self->has_schema;
     my $graph = githubexplorer::Gexf->new( schema => $self->schema );
-    my $xml = $graph->gen_gexf;
-    $xml > io('crawl.gexf');
+    $graph->gen_gexf;
 }
 
 sub graph_repo {
@@ -82,14 +81,32 @@ sub graph_repo {
     }
 }
 
-sub extract_seed {
+sub gen_seed {
     my $self = shift;
     $self->_connect unless $self->has_schema;
     my $profiles = $self->schema->resultset('Profiles')
         ->search( { blog => { '!=' => undef }, blog => { '!=' => '' } } );
+    # NOTE(review): the search above repeats the 'blog' key; Perl keeps only the last one.
+    # Writes one line per matching profile to seed.csv: blog;;;github;main_lang;other|langs;country
+    open my $fh, '>', 'seed.csv' or die "cannot open seed.csv: $!";
     while ( my $pr = $profiles->next ) {
+        my %languages;    # language name => summed size over the profile's Fork rows
+        my $forks = $self->schema->resultset('Fork')->search({profile => $pr->id});
+        while (my $fork = $forks->next) {
+            my $repo_langs = $self->schema->resultset('RepoLang')
+                ->search({repository => $fork->repos->id});
+            while (my $lang = $repo_langs->next) {
+                $languages{$lang->language->name} += $lang->size;
+            }
+        }
+        my @sorted_lang = sort {$languages{$b} <=> $languages{$a}} keys %languages;
+        my $main_lang = @sorted_lang ? shift @sorted_lang : '';
+        my $other_lang = join('|', @sorted_lang);
+        my $country = defined $pr->country ? $pr->country : '';
+        # Fields come from the current row $pr; the resultset itself has no blog/country accessors.
+        print {$fh} join(';', $pr->blog, '', '', 'github', $main_lang, $other_lang, $country), "\n";
     }
+    close $fh or die "cannot close seed.csv: $!";
 }
 
-
 1;
diff --git a/lib/githubexplorer/Gexf.pm b/lib/githubexplorer/Gexf.pm
index f7e38cb..58281d4 100644
--- a/lib/githubexplorer/Gexf.pm
+++ b/lib/githubexplorer/Gexf.pm
@@ -9,113 +9,114 @@ has id_edges => (is => 'rw', isa => 'Num', traits  => ['Counter'], default =>
 0, handles => {inc_edges => 'inc'});
 
 has graph => (
-    is      => 'rw',
-    isa     => 'HashRef',
-    default => sub {
-        my $graph = {
-            gexf => {
-                version => "1.1",
-                meta    => { creator => ['linkfluence'] },
-                graph   => {
-                    type       => 'static',
-                    attributes => {
-                        class     => 'node',
-                        type      => 'static',
-                        attribute => [
-                            {
-                                id    => 0,
-                                type  => 'float',
-                                title => 'name'
-                            },
-                            {
-                                id => 1,
-                                type => 'string',
-                                title => 'type',
-                            },
-                            {
-                                id    => 2,
-                                type  => 'float',
-                                title => 'followers_count'
-                            },
-                            {
-                                id    => 3,
-                                type  => 'float',
-                                title => 'following_count'
-                            },
-                            {
-                                id => 4,
-                                type => 'float',
-                                title => 'forks',
-                            },
-                            {
-                                id => 5,
-                                type => 'string',
-                                title => 'location',
-                            },
-                            {
-                                id => 6,
-                                type => 'float',
-                                title => 'public_gist_count',
-                            },
-                            {
-                                id => 7,
-                                type => 'float',
-                                title => 'public_repo_count',
-                            },
-                            {
-                                id => 8,
-                                type => 'string',
-                                title => 'language',
-                            },
-                            {
-                                id => 9,
-                                type => 'string',
-                                title => 'description',
-                            },
-                            {
-                                id => 10,
-                                type => 'float',
-                                title => 'watchers',
-                            }
-                        ]
-                    }
+is      => 'rw',
+isa     => 'HashRef',
+default => sub {    # GEXF document skeleton: header plus the node attribute declarations
+    my $graph = {
+        gexf => {
+            version => "1.1",
+            meta    => { creator => ['linkfluence'] },
+            graph   => {
+                type       => 'static',
+                attributes => {
+                    class     => 'node',
+                    type      => 'static',
+                    attribute => [
+                        {
+                            id    => 0,
+                            type  => 'string',    # attribute 0 carries profile/repository names, i.e. text
+                            title => 'name'
+                        },
+                        {
+                            id => 1,
+                            type => 'string',
+                            title => 'type',
+                        },
+                        {
+                            id    => 2,
+                            type  => 'float',
+                            title => 'followers_count'
+                        },
+                        {
+                            id    => 3,
+                            type  => 'float',
+                            title => 'following_count'
+                        },
+                        {
+                            id => 4,
+                            type => 'float',
+                            title => 'forks',
+                        },
+                        {
+                            id => 5,
+                            type => 'string',
+                            title => 'location',
+                        },
+                        {
+                            id => 6,
+                            type => 'float',
+                            title => 'public_gist_count',
+                        },
+                        {
+                            id => 7,
+                            type => 'float',
+                            title => 'public_repo_count',
+                        },
+                        {
+                            id => 8,
+                            type => 'string',
+                            title => 'language',
+                        },
+                        {
+                            id => 9,
+                            type => 'string',
+                            title => 'description',
+                        },
+                        {
+                            id => 10,
+                            type => 'float',
+                            title => 'watchers',
+                        }
+                    ]
                 }
             }
-        };
-    }
+        }
+    };
+}
 );
 
 sub gen_gexf {
     my $self = shift;
-    $self->profiles;
-    #$self->repositories;
-    say "total nodes : ".scalar (@{ $self->graph->{gexf}->{graph}->{nodes}->{node} });
-    say "total edges : ".scalar (@{ $self->graph->{gexf}->{graph}->{edges}->{edge} });
+    # Runs the three graph builders in turn and writes each result to its own .gexf file.
+    $self->basic_profiles;
+    my $basic_profiles = $self->dump_gexf;
+    $basic_profiles > io('basic_profiles.gexf');
+
+    $self->profiles_from_repositories;
+    my $profiles_from_repositories = $self->dump_gexf;
+    $profiles_from_repositories > io ('profiles_from_repositories.gexf');
+
+    $self->repositories_from_profiles;
+    my $repositories_from_profiles = $self->dump_gexf;
+    $repositories_from_profiles > io ('repositories_from_profiles.gexf');    # third dump, not the second
+}
+
+sub dump_gexf {    # serialise the current graph with XMLout, then empty it; returns the XML string
+    my $self = shift;
     my $xml_out = XMLout( $self->graph, AttrIndent => 1, keepRoot => 1 );
+    # Remove the collected nodes/edges so the next builder starts from the bare skeleton.
+    delete @{ $self->graph->{gexf}->{graph} }{qw(nodes edges)};
     return $xml_out;
 }
 
-sub profiles {
+sub basic_profiles {
     my $self     = shift;
-    say "start profiles ...";
+    $self->id_edges(0);
+    say "start basic_profiles ...";
     my $profiles = $self->schema->resultset('Profiles')->search();
 
     while ( my $profile = $profiles->next ) {
-        my $node = {
-            id              => $profile->id,
-            label           => $profile->login,
-            attvalues => {
-                attvalue => [
-                    { for => 0, value => $profile->name},
-                    { for => 1, value => "profile"},
-                    { for => 2, value => $profile->followers_count},
-                    { for => 3, value => $profile->following_count},
-                    { for => 5, value => $profile->location},
-                    { for => 6, value => $profile->public_gist_count},
-                    { for => 7, value => $profile->public_repo_count},
-                ]
-            },
-        };
+        my $node = $self->_get_node_for_profile($profile);
         push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
     }
 
@@ -129,26 +130,57 @@ sub profiles {
         };
         push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
     }
-    say " done";
+    say "basic_profiles done";
 }
 
-sub repositories {
+sub profiles_from_repositories {
     my $self = shift;
+    $self->id_edges(0);
+    say "start profiles_from_repositories ...";
 
-    say "start repositories ...";
-    my $repositories = $self->schema->resultset('Repositories')->search({fork => 0});
+    # Nodes: every profile.  Edges: one per pair of profiles that share a Fork row on a repository.
+    my $profiles = $self->schema->resultset('Profiles')->search();
+    while (my $profile = $profiles->next) {
+        my $node = $self->_get_node_for_profile($profile);
+        push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
+    }
+    my $repositories = $self->schema->resultset('Repositories')->search();
     while (my $repos = $repositories->next) {
+        my $forks = $self->schema->resultset('Fork')->search({repos => $repos->id});
+        my @profiles;
+        while (my $fork = $forks->next) {
+            push @profiles, $fork->profile->id;
+        }
+        foreach my $p (@profiles) {
+            foreach my $other (@profiles) {
+                last if $other eq $p;    # link $p only to the ids listed before it: one edge per pair
+                my $e = {
+                    source => $p,
+                    target => $other,
+                    id => $self->inc_edges,
+                };
+                push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+            }
+        }
+    }
+    say "profiles_from_repositories done";
+}
+
+sub repositories_from_profiles {    # nodes: repositories (one per name); edges: profile id -> repository name
+    my $self = shift;
+    $self->id_edges(0);
+    say "start repositories_from_profiles ...";
 
-        next if $repos->name =~ /dotfiles/i;
-        # available in forks ?
-        my $check_fork = $self->schema->resultset('Fork')->search({repos => $repos->id});
-        next if $check_fork->count < 1;
+    my $nodes = {};    # repository name => node hash, so a name yields a single node
+    my $repositories = $self->schema->resultset('Repositories')->search();
+    while (my $repos = $repositories->next) {
+        next if $repos->name =~ /dotfiles/;
 
-        if (!grep {$_->{id} eq "repos_".$repos->name} @{$self->graph->{gexf}->{graph}->{nodes}->{node}}) {
+        if (!exists $nodes->{$repos->name}) {
             my $language = $self->schema->resultset('RepoLang')->search({repository => $repos->id}, {order_by => 'size'})->first;
             my $lang = $language ? $language->language->name : 'none';
-            my $node = {
-                id => "repos_".$repos->name,
+            $nodes->{$repos->name} = {
+                id => $repos->name,
                 label => $repos->name,
                 attvalues => {
                     attvalue => [
@@ -161,28 +193,118 @@ sub repositories {
                     ],
                 },
             };
-            push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
         }
-        my $e = {
-            source   => $repos->id_profile->id,
-            target   => "repos_".$repos->name,
-            id       => $self->inc_edges,
-        };
-        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        my $forks = $self->schema->resultset('Fork')->search({repos => $repos->id});
+        while (my $fork = $forks->next) {
+            my $e = {
+                source   => $fork->profile->id,
+                target   => $fork->repos->name,
+                id       => $self->inc_edges,
+            };
+            push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        }
     }
+    push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, values %$nodes;    # add the collected nodes
+    say "repositories_from_profiles done";
+}
+
+sub stats_languages_by_country {    # placeholder: the body only unpacks $self, nothing is computed yet
+    my $self = shift;
+}
+
+# Build the GEXF node hash for one Profiles row; the language falls back to 'none' like repository nodes.
+sub _get_node_for_profile {
+    my ($self, $profile) = @_;
+    my (undef, $ordered_languages) = $self->_get_languages_for_profile($profile);
+    my $main_lang = @$ordered_languages ? $ordered_languages->[0] : 'none';
+    return {
+        id              => $profile->id,
+        label           => $profile->login,
+        attvalues => {
+            attvalue => [
+                { for => 0, value => $profile->name},
+                { for => 1, value => "profile"},
+                { for => 2, value => $profile->followers_count},
+                { for => 3, value => $profile->following_count},
+                { for => 5, value => $profile->country},
+                { for => 6, value => $profile->public_gist_count},
+                { for => 7, value => $profile->public_repo_count},
+                { for => 8, value => $main_lang},
+            ]
+        },
+    };
+}
 
-    my $forks = $self->schema->resultset('Fork')->search();
+sub _get_languages_for_profile {
+    my ($self, $profile) = @_;    # unpack both arguments; a bare shift would leave $profile undef
 
+    my $forks = $self->schema->resultset('Fork')->search({profile =>
+        $profile->id});
+    # Sum RepoLang sizes per language name over every repository the profile has a Fork row for.
+    my %languages;
     while (my $fork = $forks->next) {
-        next if $fork->repos->name =~ /dotfiles/i;
-        my $e = {
-            source   => $fork->profile->id,
-            target   => "repos_".$fork->repos->name,
-            id       => $self->inc_edges,
-        };
-        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        my $repo_langs =
+        $self->schema->resultset('RepoLang')->search({repository =>
+                $fork->repos->id});
+        while (my $lang = $repo_langs->next) {
+            $languages{$lang->language->name} += $lang->size;
+        }
     }
-    say " done";
+    my @sorted_lang = sort {$languages{$b} <=> $languages{$a}} keys %languages;    # largest total first
+    return (\%languages, \@sorted_lang);    # (totals hashref, names arrayref in that order)
 }
 
+#sub repositories {
+#    my $self = shift;
+#
+#    say "start repositories ...";
+#    my $repositories = $self->schema->resultset('Repositories')->search({fork => 0});
+#    while (my $repos = $repositories->next) {
+#
+#        next if $repos->name =~ /dotfiles/i;
+#        # available in forks ?
+#        my $check_fork = $self->schema->resultset('Fork')->search({repos => $repos->id});
+#        next if $check_fork->count < 1;
+#
+#        if (!grep {$_->{id} eq "repos_".$repos->name} @{$self->graph->{gexf}->{graph}->{nodes}->{node}}) {
+#            my $language = $self->schema->resultset('RepoLang')->search({repository => $repos->id}, {order_by => 'size'})->first;
+#            my $lang = $language ? $language->language->name : 'none';
+#            my $node = {
+#                id => "repos_".$repos->name,
+#                label => $repos->name,
+#                attvalues => {
+#                    attvalue => [
+#                        { for => 0,  value => $repos->name},
+#                        { for => 1,  value => "repository"},
+#                        { for => 4,  value => $repos->forks},
+#                        { for => 9,  value => $repos->description},
+#                        { for => 10, value => $repos->watchers},
+#                        { for => 8,  value => $lang},
+#                    ],
+#                },
+#            };
+#            push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
+#        }
+#        my $e = {
+#            source   => $repos->id_profile->id,
+#            target   => "repos_".$repos->name,
+#            id       => $self->inc_edges,
+#        };
+#        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+#    }
+#
+#    my $forks = $self->schema->resultset('Fork')->search();
+#
+#    while (my $fork = $forks->next) {
+#        next if $fork->repos->name =~ /dotfiles/i;
+#        my $e = {
+#            source   => $fork->profile->id,
+#            target   => "repos_".$fork->repos->name,
+#            id       => $self->inc_edges,
+#        };
+#        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+#    }
+#    say " done";
+#}
+
 1;
diff --git a/lib/githubexplorer/Schema/Result/Profiles.pm b/lib/githubexplorer/Schema/Result/Profiles.pm
index e0349d7..b43211e 100644
--- a/lib/githubexplorer/Schema/Result/Profiles.pm
+++ b/lib/githubexplorer/Schema/Result/Profiles.pm
@@ -15,6 +15,8 @@ __PACKAGE__->add_columns(
     following_count   => { data_type => 'int' },
     gravatar_id       => { data_type => 'varchar', is_nullable => 1 },
     location          => { data_type => 'varchar', is_nullable => 1 },
+    country          => { data_type => 'varchar', is_nullable => 1 },
+    city          => { data_type => 'varchar', is_nullable => 1 },
     name              => { data_type => 'varchar', is_nullable => 1 },
     public_gist_count => { data_type => 'int' },
     public_repo_count => { data_type => 'int' },