2016-11-23 06:55:40 +00:00
package google
import (
2018-09-10 22:07:48 +00:00
"context"
2016-11-23 06:55:40 +00:00
"fmt"
"log"
2018-05-23 00:45:22 +00:00
"strings"
2016-11-23 06:55:40 +00:00
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happening, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being thrown. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
"github.com/hashicorp/errwrap"
2016-11-23 06:55:40 +00:00
"github.com/hashicorp/terraform/helper/schema"
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
"google.golang.org/api/googleapi"
2018-05-23 00:45:22 +00:00
"google.golang.org/api/serviceusage/v1beta1"
2016-11-23 06:55:40 +00:00
)
func resourceGoogleProjectServices ( ) * schema . Resource {
return & schema . Resource {
Create : resourceGoogleProjectServicesCreate ,
Read : resourceGoogleProjectServicesRead ,
Update : resourceGoogleProjectServicesUpdate ,
Delete : resourceGoogleProjectServicesDelete ,
2017-10-23 21:23:51 +00:00
Importer : & schema . ResourceImporter {
State : schema . ImportStatePassthrough ,
} ,
2016-11-23 06:55:40 +00:00
Schema : map [ string ] * schema . Schema {
"project" : & schema . Schema {
Type : schema . TypeString ,
2018-05-29 21:26:58 +00:00
Optional : true ,
2016-11-23 06:55:40 +00:00
ForceNew : true ,
2018-05-29 21:26:58 +00:00
Computed : true ,
2016-11-23 06:55:40 +00:00
} ,
"services" : {
Type : schema . TypeSet ,
Required : true ,
Set : schema . HashString ,
2018-11-13 23:52:01 +00:00
Elem : & schema . Schema {
Type : schema . TypeString ,
ValidateFunc : StringNotInSlice ( ignoredProjectServices , false ) ,
} ,
2016-11-23 06:55:40 +00:00
} ,
2018-04-04 23:20:13 +00:00
"disable_on_destroy" : & schema . Schema {
Type : schema . TypeBool ,
Optional : true ,
Default : true ,
} ,
2016-11-23 06:55:40 +00:00
} ,
}
}
2018-11-13 23:52:01 +00:00
var ignoredProjectServices = [ ] string { "dataproc-control.googleapis.com" , "source.googleapis.com" , "stackdriverprovisioning.googleapis.com" }
2017-04-27 20:00:54 +00:00
// These services can only be enabled as a side-effect of enabling other services,
// so don't bother storing them in the config or using them for diffing.
2018-11-13 23:52:01 +00:00
var ignoreProjectServices = golangSetFromStringSlice ( ignoredProjectServices )
2017-04-27 20:00:54 +00:00
2016-11-23 06:55:40 +00:00
func resourceGoogleProjectServicesCreate ( d * schema . ResourceData , meta interface { } ) error {
config := meta . ( * Config )
2018-05-29 21:26:58 +00:00
pid , err := getProject ( d , config )
if err != nil {
return err
}
2016-11-23 06:55:40 +00:00
// Get services from config
cfgServices := getConfigServices ( d )
// Get services from API
2017-11-07 23:19:57 +00:00
apiServices , err := getApiServices ( pid , config , ignoreProjectServices )
2016-11-23 06:55:40 +00:00
if err != nil {
return fmt . Errorf ( "Error creating services: %v" , err )
}
// This call disables any APIs that aren't defined in cfgServices,
// and enables all of those that are
err = reconcileServices ( cfgServices , apiServices , config , pid )
if err != nil {
return fmt . Errorf ( "Error creating services: %v" , err )
}
d . SetId ( pid )
return resourceGoogleProjectServicesRead ( d , meta )
}
func resourceGoogleProjectServicesRead ( d * schema . ResourceData , meta interface { } ) error {
config := meta . ( * Config )
2017-11-07 23:19:57 +00:00
services , err := getApiServices ( d . Id ( ) , config , ignoreProjectServices )
2016-11-23 06:55:40 +00:00
if err != nil {
return err
}
2017-10-23 21:23:51 +00:00
d . Set ( "project" , d . Id ( ) )
2016-11-23 06:55:40 +00:00
d . Set ( "services" , services )
return nil
}
func resourceGoogleProjectServicesUpdate ( d * schema . ResourceData , meta interface { } ) error {
log . Printf ( "[DEBUG]: Updating google_project_services" )
config := meta . ( * Config )
// Get services from config
cfgServices := getConfigServices ( d )
// Get services from API
2018-05-29 21:26:58 +00:00
apiServices , err := getApiServices ( d . Id ( ) , config , ignoreProjectServices )
2016-11-23 06:55:40 +00:00
if err != nil {
return fmt . Errorf ( "Error updating services: %v" , err )
}
// This call disables any APIs that aren't defined in cfgServices,
// and enables all of those that are
2018-05-29 21:26:58 +00:00
err = reconcileServices ( cfgServices , apiServices , config , d . Id ( ) )
2016-11-23 06:55:40 +00:00
if err != nil {
return fmt . Errorf ( "Error updating services: %v" , err )
}
return resourceGoogleProjectServicesRead ( d , meta )
}
func resourceGoogleProjectServicesDelete ( d * schema . ResourceData , meta interface { } ) error {
log . Printf ( "[DEBUG]: Deleting google_project_services" )
2018-04-04 23:20:13 +00:00
if disable := d . Get ( "disable_on_destroy" ) ; ! ( disable . ( bool ) ) {
log . Printf ( "Not disabling service '%s', because disable_on_destroy is false." , d . Id ( ) )
d . SetId ( "" )
return nil
}
2016-11-23 06:55:40 +00:00
config := meta . ( * Config )
services := resourceServices ( d )
for _ , s := range services {
disableService ( s , d . Id ( ) , config )
}
d . SetId ( "" )
return nil
}
// This function ensures that the services enabled for a project exactly match that
// in a config by disabling any services that are returned by the API but not present
// in the config
func reconcileServices ( cfgServices , apiServices [ ] string , config * Config , pid string ) error {
// Helper to convert slice to map
m := func ( vals [ ] string ) map [ string ] struct { } {
sm := make ( map [ string ] struct { } )
for _ , s := range vals {
sm [ s ] = struct { } { }
}
return sm
}
cfgMap := m ( cfgServices )
apiMap := m ( apiServices )
for k , _ := range apiMap {
if _ , ok := cfgMap [ k ] ; ! ok {
// The service in the API is not in the config; disable it.
err := disableService ( k , pid , config )
if err != nil {
return err
}
} else {
// The service exists in the config and the API, so we don't need
// to re-enable it
delete ( cfgMap , k )
}
}
2018-05-23 00:45:22 +00:00
keys := make ( [ ] string , 0 , len ( cfgMap ) )
2016-11-23 06:55:40 +00:00
for k , _ := range cfgMap {
2018-05-23 00:45:22 +00:00
keys = append ( keys , k )
}
err := enableServices ( keys , pid , config )
if err != nil {
return err
2016-11-23 06:55:40 +00:00
}
return nil
}
// getConfigServices returns the entries of the "services" set in d as a slice
// of strings. It returns nil when GetOk reports the field as not set.
func getConfigServices(d *schema.ResourceData) (services []string) {
	raw, ok := d.GetOk("services")
	if !ok {
		return nil
	}

	for _, item := range raw.(*schema.Set).List() {
		services = append(services, item.(string))
	}
	return services
}
// Retrieve a project's services from the API
2017-11-07 23:19:57 +00:00
func getApiServices ( pid string , config * Config , ignore map [ string ] struct { } ) ( [ ] string , error ) {
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
if ignore == nil {
ignore = make ( map [ string ] struct { } )
}
2018-05-31 21:50:38 +00:00
var apiServices [ ] string
if err := retryTime ( func ( ) error {
// Reset the list of apiServices in case of a retry. A partial page failure
// could result in duplicate services.
apiServices = make ( [ ] string , 0 , 10 )
ctx := context . Background ( )
return config . clientServiceUsage . Services .
List ( "projects/" + pid ) .
2018-07-06 19:37:19 +00:00
Fields ( "services/name,nextPageToken" ) .
2018-05-31 21:50:38 +00:00
Filter ( "state:ENABLED" ) .
Pages ( ctx , func ( r * serviceusage . ListServicesResponse ) error {
for _ , v := range r . Services {
// services are returned as "projects/PROJECT/services/NAME"
parts := strings . Split ( v . Name , "/" )
if len ( parts ) > 0 {
name := parts [ len ( parts ) - 1 ]
if _ , ok := ignore [ name ] ; ! ok {
apiServices = append ( apiServices , name )
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
}
}
2018-05-31 21:50:38 +00:00
return nil
} )
} , 10 ) ; err != nil {
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
return nil , errwrap . Wrapf ( "failed to list services: {{err}}" , err )
2016-11-23 06:55:40 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
2016-11-23 06:55:40 +00:00
return apiServices , nil
}
func enableService ( s , pid string , config * Config ) error {
2018-05-23 00:45:22 +00:00
return enableServices ( [ ] string { s } , pid , config )
}
func enableServices ( s [ ] string , pid string , config * Config ) error {
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
// It's not permitted to enable more than 20 services in one API call (even
// for batch).
//
// https://godoc.org/google.golang.org/api/serviceusage/v1beta1#BatchEnableServicesRequest
batchSize := 20
for i := 0 ; i < len ( s ) ; i += batchSize {
j := i + batchSize
if j > len ( s ) {
j = len ( s )
2018-05-23 00:45:22 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
services := s [ i : j ]
if err := retryTime ( func ( ) error {
var sop * serviceusage . Operation
var err error
if len ( services ) < 1 {
// No more services to enable
return nil
} else if len ( services ) == 1 {
// Use the singular enable - can't use batch for a single item
name := fmt . Sprintf ( "projects/%s/services/%s" , pid , services [ 0 ] )
req := & serviceusage . EnableServiceRequest { }
sop , err = config . clientServiceUsage . Services . Enable ( name , req ) . Do ( )
} else {
// Batch enable 2+ services
name := fmt . Sprintf ( "projects/%s" , pid )
req := & serviceusage . BatchEnableServicesRequest { ServiceIds : services }
sop , err = config . clientServiceUsage . Services . BatchEnable ( name , req ) . Do ( )
}
if err != nil {
// Check for a "precondition failed" error. The API seems to randomly
// (although more than 50%) return this error when enabling certain
// APIs. It's transient, so we catch it and re-raise it as an error that
// is retryable instead.
if gerr , ok := err . ( * googleapi . Error ) ; ok {
if ( gerr . Code == 400 || gerr . Code == 412 ) && gerr . Message == "Precondition check failed." {
return & googleapi . Error {
Code : 503 ,
Message : "api returned \"precondition failed\" while enabling service" ,
}
}
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
return errwrap . Wrapf ( "failed to issue request: {{err}}" , err )
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
// Poll for the API to return
activity := fmt . Sprintf ( "apis %q to be enabled for %s" , services , pid )
_ , waitErr := serviceUsageOperationWait ( config , sop , activity )
if waitErr != nil {
return waitErr
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series' of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happened, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being throw. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
// Accumulate the list of services that are enabled on the project
enabledServices , err := getApiServices ( pid , config , nil )
if err != nil {
return err
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happening, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being thrown. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
// Diff the list of requested services to enable against the list of
// services on the project.
missing := diffStringSlice ( services , enabledServices )
// If there are any missing, force a retry
if len ( missing ) > 0 {
// Spoof a googleapi Error so retryTime will try again
return & googleapi . Error {
Code : 503 ,
Message : fmt . Sprintf ( "The service(s) %q are still being enabled for project %s. This isn't a real API error, this is just eventual consistency." , missing , pid ) ,
}
}
return nil
} , 10 ) ; err != nil {
return errwrap . Wrap ( err , fmt . Errorf ( "failed to enable service(s) %q for project %s" , services , pid ) )
Guard against eventually consistent services
When enabling services, after the waiter returns, list the enabled
services and ensure the ones we enabled are in there. If not, retry. May
not always resolve #1393, but should help. Unfortunately, the real
answer is probably either:
1. For us to try and get the API updated to only return the waiter when
the service will consistently be available. I don't know how feasible
this is, but I'm willing to open a ticket.
2. For us to build retries into ~all our resources to retry for a set
amount of time when a service not enabled error is returned. This would
greatly slow down the provider in the case of the service legitimately
not being enabled, but is how other providers handle this class of
problem.
Unfortunately, due to the eventual consistency at play, this is a hard
issue to reproduce and prove, though it matches with my
experience--while testing this patch, one of the tests failed with the
error that the serviceusage API hadn't been enabled, but only on step 4
of the test, when calls had already succeeded. Which suggests eventual
consistency, to me. Regardless, this patch shouldn't _hurt_ and should
mostly be an imperceptible change to users, and should make instances
like #1393 less likely.
2018-05-23 11:01:05 +00:00
}
2016-11-23 06:55:40 +00:00
}
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happening, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being thrown. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
2016-11-23 06:55:40 +00:00
return nil
}
2017-11-07 23:19:57 +00:00
Lions, tigers, and services being enabled with "precondition failed", oh my! (#1565)
* Use errwrap to retain original error
* Use built-in Page function, only return names when listing services
This removes the custom logic on pagination and uses the built-in Page function in the SDK to make things a bit simpler. Additionally, I added a field filter to only return service names, which drastically reduces the size of the API call (important for slow connections, given how frequently this function is executed).
Also added errwrap to better trace where errors originate.
* Add helper function for diffing string slices
This just looked really nasty inline
* Batch 20 services at a time, handle precondition failed, better errwrap
This commit does three things:
1. It batches services to be enabled 20 at a time. The API fails if you try to enable more than 20 services, and this is documented in the SDK and API. I learned this the hard way. I think Terraform should "do the right thing" here and batch them in series of twenty, which is what this does. Each batch is tried in serial, but I think making it parallelized is not worth the complexity tradeoffs.
2. Handle the precondition failed error that occurs randomly. This just started happening, but it affects at least two APIs consistently, and a rudimentary test showed that it failed 78% of the time (78/100 times in an hour). We should fix this upstream, but that failure rate also necessitates (in my opinion) some mitigation on the Terraform side until a fix is in place at the API level.
3. Use errwrap on errors for better tracing. It was really difficult to trace exactly which error was being thrown. That's fixed.
* Updates from code review
2018-05-31 16:26:40 +00:00
// diffStringSlice returns every entry of wanted that has no equal entry in
// actual. Entries are reported in the order they occur in wanted, and a value
// that is repeated in wanted is reported once per occurrence. The result is a
// nil slice when nothing is missing.
func diffStringSlice(wanted, actual []string) []string {
	var missing []string

nextWanted:
	for _, w := range wanted {
		for _, a := range actual {
			if a == w {
				// Present in actual; move on to the next wanted entry.
				continue nextWanted
			}
		}
		// Scanned all of actual without a match.
		missing = append(missing, w)
	}

	return missing
}
2016-11-23 06:55:40 +00:00
func disableService ( s , pid string , config * Config ) error {
2017-11-14 19:41:57 +00:00
err := retryTime ( func ( ) error {
2018-05-23 00:45:22 +00:00
name := fmt . Sprintf ( "projects/%s/services/%s" , pid , s )
sop , err := config . clientServiceUsage . Services . Disable ( name , & serviceusage . DisableServiceRequest { } ) . Do ( )
2017-11-07 23:19:57 +00:00
if err != nil {
return err
}
// Wait for the operation to complete
2018-05-23 00:45:22 +00:00
_ , waitErr := serviceUsageOperationWait ( config , sop , "api to disable" )
2017-11-07 23:19:57 +00:00
if waitErr != nil {
return waitErr
}
return nil
2017-11-14 19:41:57 +00:00
} , 10 )
2016-11-23 06:55:40 +00:00
if err != nil {
return fmt . Errorf ( "Error disabling service %q for project %q: %v" , s , pid , err )
}
return nil
}
// resourceServices reads the "services" attribute from d and returns its
// members as a slice of strings, in the order produced by the set's List
// method. It returns nil when the attribute value is nil.
//
// The attribute is asserted to be a *schema.Set whose elements are strings;
// any other shape panics on the type assertion.
func resourceServices(d *schema.ResourceData) []string {
	raw := d.Get("services")
	if raw == nil {
		return nil
	}

	set := raw.(*schema.Set)
	services := make([]string, 0, set.Len())
	for _, item := range set.List() {
		services = append(services, item.(string))
	}
	return services
}